cub-2.0.1/.git-blame-ignore-revs

# Exclude these commits from git-blame and similar tools.
#
# To use this file, run the following command from the repo root:
#
# ```
# $ git config blame.ignoreRevsFile .git-blame-ignore-revs
# ```
#
# Include a brief comment with each commit added, for example:
#
# ```
# d92d9f8baac5ec48a8f8718dd69f415a45efe372 # Initial clang-format
# ```
#
# Only add commits that are pure formatting changes (e.g.
# clang-format version changes, etc).

cub-2.0.1/.github/workflows/mirror-main-branch-to-master-branch.yml

on:
  push:
    branches:
      - "main"

jobs:
  mirror-main-branch-to-master-branch:
    name: Mirror main branch to master branch
    runs-on: ubuntu-latest
    steps:
      - name: Mirror main branch to master branch
        id: mirror
        uses: google/mirror-branch-action@v1.0
        with:
          source: "main"
          dest: "master"
          github-token: ${{ secrets.GITHUB_TOKEN }}

cub-2.0.1/.github/workflows/push-to-legacy-repositories.yml

on: push

jobs:
  push-to-legacy-repositories:
    name: Push to legacy repositories
    runs-on: ubuntu-latest
    steps:
      - name: Push `main` to github.com/nvlabs/cub
        uses: wei/git-sync@v2
        if: github.repository == 'nvidia/cub'
        with:
          source_repo: "nvidia/cub"
          source_branch: "main"
          destination_repo: "nvlabs/cub"
          destination_branch: "main"
          ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
      - name: Push all tags to github.com/nvlabs/cub
        uses: wei/git-sync@v2
        if: github.repository == 'nvidia/cub'
        with:
          source_repo: "nvidia/cub"
          source_branch: "refs/tags/*"
          destination_repo: "nvlabs/cub"
          destination_branch: "refs/tags/*"
          ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
      - name: Push `main` to github.com/thrust/cub
        uses: wei/git-sync@v2
        if: github.repository == 'nvidia/cub'
        with:
          source_repo: "nvidia/cub"
          source_branch: "main"
          destination_repo: "thrust/cub"
          destination_branch: "main"
          ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
      - name: Push all tags to github.com/thrust/cub
        uses: wei/git-sync@v2
        if: github.repository == 'nvidia/cub'
        with:
          source_repo: "nvidia/cub"
          source_branch: "refs/tags/*"
          destination_repo: "thrust/cub"
          destination_branch: "refs/tags/*"
          ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}

cub-2.0.1/.gitignore

.p4config
*~
\#*
/build

cub-2.0.1/CHANGELOG.md

# CUB 2.0.0

## Summary

The CUB 2.0.0 major release adds a dependency on libcu++ and contains several breaking changes. These include new diagnostics when inspecting device-only lambdas from the host, an updated method of determining accumulator types for algorithms like Reduce and Scan, and a compile-time replacement for the runtime `debug_synchronous` debugging flags.

This release also includes several new features. `DeviceHistogram` now supports `__half` and better handles various edge cases.
`WarpReduce` now performs correctly when restricted to a single-thread “warp”, and will use the `__reduce_add_sync` accelerated intrinsic (introduced with Ampere) when appropriate. `DeviceRadixSort` learned to handle the case where `begin_bit == end_bit`. Several algorithms also have updated documentation, with a particular focus on clarifying which operations can and cannot be performed in-place. ## Breaking Changes - NVIDIA/cub#448 Add libcu++ dependency (v1.8.0+). - NVIDIA/cub#448: The following macros are no longer defined by default. They can be re-enabled by defining `CUB_PROVIDE_LEGACY_ARCH_MACROS`. These will be completely removed in a future release. - `CUB_IS_HOST_CODE`: Replace with `NV_IF_TARGET`. - `CUB_IS_DEVICE_CODE`: Replace with `NV_IF_TARGET`. - `CUB_INCLUDE_HOST_CODE`: Replace with `NV_IF_TARGET`. - `CUB_INCLUDE_DEVICE_CODE`: Replace with `NV_IF_TARGET`. - NVIDIA/cub#486: CUB’s CUDA Runtime support macros have been updated to support `NV_IF_TARGET`. They are now defined consistently across all host/device compilation passes. This should not affect most usages of these macros, but may require changes for some edge cases. - `CUB_RUNTIME_FUNCTION`: Execution space annotations for functions that invoke CUDA Runtime APIs. - Old behavior: - RDC enabled: Defined to `__host__ __device__` - RDC not enabled: - NVCC host pass: Defined to `__host__ __device__` - NVCC device pass: Defined to `__host__` - New behavior: - RDC enabled: Defined to `__host__ __device__` - RDC not enabled: Defined to `__host__` - `CUB_RUNTIME_ENABLED`: No change in behavior, but no longer used in CUB. Provided for legacy support only. Legacy behavior: - RDC enabled: Macro is defined. - RDC not enabled: - NVCC host pass: Macro is defined. - NVCC device pass: Macro is not defined. - `CUB_RDC_ENABLED`: New macro, may be combined with `NV_IF_TARGET` to replace most usages of `CUB_RUNTIME_ENABLED`. Behavior: - RDC enabled: Macro is defined. - RDC not enabled: Macro is not defined. - NVIDIA/cub#509: A compile-time error is now emitted when a `__device__`-only lambda’s return type is queried from host code (requires libcu++ ≥ 1.9.0). - Due to limitations in the CUDA programming model, the result of this query is unreliable, and will silently return an incorrect result. This leads to difficult to debug errors. - When using libcu++ 1.9.0, an error will be emitted with information about work-arounds: - Use a named function object with a `__device__`-only implementation of `operator()`. - Use a `__host__ __device__` lambda. - Use `cuda::proclaim_return_type` (Added in libcu++ 1.9.0) - NVIDIA/cub#509: Use the result type of the binary reduction operator for accumulating intermediate results in the `DeviceReduce` algorithm, following guidance from http://wg21.link/P2322R6. - This change requires host-side introspection of the binary operator’s signature, and device-only extended lambda functions can no longer be used. - In addition to the behavioral changes, the interfaces for the `Dispatch*Reduce` layer have changed: - `DispatchReduce`: - Now accepts accumulator type as last parameter. - Now accepts initializer type instead of output iterator value type. - Constructor now accepts `init` as initial type instead of output iterator value type. - `DispatchSegmentedReduce`: - Accepts accumulator type as last parameter. - Accepts initializer type instead of output iterator value type. 
- Thread operators now accept parameters using different types: `Equality` , `Inequality`, `InequalityWrapper`, `Sum`, `Difference`, `Division`, `Max` , `ArgMax`, `Min`, `ArgMin`. - `ThreadReduce` now accepts accumulator type and uses a different type for `prefix`. - NVIDIA/cub#511: Use the result type of the binary operator for accumulating intermediate results in the `DeviceScan`, `DeviceScanByKey`, and `DeviceReduceByKey` algorithms, following guidance from http://wg21.link/P2322R6. - This change requires host-side introspection of the binary operator’s signature, and device-only extended lambda functions can no longer be used. - In addition to the behavioral changes, the interfaces for the `Dispatch` layer have changed: - `DispatchScan`now accepts accumulator type as a template parameter. - `DispatchScanByKey`now accepts accumulator type as a template parameter. - `DispatchReduceByKey`now accepts accumulator type as the last template parameter. - NVIDIA/cub#527: Deprecate the `debug_synchronous` flags on device algorithms. - This flag no longer has any effect. Define `CUB_DEBUG_SYNC` during compilation to enable these checks. - Moving this option from run-time to compile-time avoids the compilation overhead of unused debugging paths in production code. ## New Features - NVIDIA/cub#514: Support `__half` in `DeviceHistogram`. - NVIDIA/cub#516: Add support for single-threaded invocations of `WarpReduce`. - NVIDIA/cub#516: Use `__reduce_add_sync` hardware acceleration for `WarpReduce` on supported architectures. ## Bug Fixes - NVIDIA/cub#481: Fix the device-wide radix sort implementations to simply copy the input to the output when `begin_bit == end_bit`. - NVIDIA/cub#487: Fix `DeviceHistogram::Even` for a variety of edge cases: - Bin ids are now correctly computed when mixing different types for `SampleT` and `LevelT`. - Bin ids are now correctly computed when `LevelT` is an integral type and the number of levels does not evenly divide the level range. - NVIDIA/cub#508: Ensure that `temp_storage_bytes` is properly set in the `AdjacentDifferenceCopy` device algorithms. - NVIDIA/cub#508: Remove excessive calls to the binary operator given to the `AdjacentDifferenceCopy` device algorithms. - NVIDIA/cub#533: Fix debugging utilities when RDC is disabled. ## Other Enhancements - NVIDIA/cub#448: Removed special case code for unsupported CUDA architectures. - NVIDIA/cub#448: Replace several usages of `__CUDA_ARCH__` with `` to handle host/device code divergence. - NVIDIA/cub#448: Mark unused PTX arch parameters as legacy. - NVIDIA/cub#476: Enabled additional debug logging for the onesweep radix sort implementation. Thanks to @canonizer for this contribution. - NVIDIA/cub#480: Add `CUB_DISABLE_BF16_SUPPORT` to avoid including the `cuda_bf16.h` header or using the `__nv_bfloat16` type. - NVIDIA/cub#486: Add debug log messages for post-kernel debug synchronizations. - NVIDIA/cub#490: Clarify documentation for in-place usage of `DeviceScan` algorithms. - NVIDIA/cub#494: Clarify documentation for in-place usage of `DeviceHistogram` algorithms. - NVIDIA/cub#495: Clarify documentation for in-place usage of `DevicePartition` algorithms. - NVIDIA/cub#499: Clarify documentation for in-place usage of `Device*Sort` algorithms. - NVIDIA/cub#500: Clarify documentation for in-place usage of `DeviceReduce` algorithms. - NVIDIA/cub#501: Clarify documentation for in-place usage of `DeviceRunLengthEncode` algorithms. - NVIDIA/cub#503: Clarify documentation for in-place usage of `DeviceSelect` algorithms. 
- NVIDIA/cub#518: Fix typo in `WarpMergeSort` documentation. - NVIDIA/cub#519: Clarify segmented sort documentation regarding the handling of elements that are not included in any segment. # CUB 1.17.1 ## Summary CUB 1.17.1 is a minor bugfix release. - NVIDIA/cub#508: Ensure that `temp_storage_bytes` is properly set in the `AdjacentDifferenceCopy` device algorithms. - NVIDIA/cub#508: Remove excessive calls to the binary operator given to the `AdjacentDifferenceCopy` device algorithms. - Fix device-side debug synchronous behavior in `DeviceSegmentedSort`. # CUB 1.17.0 ## Summary CUB 1.17.0 is the final minor release of the 1.X series. It provides a variety of bug fixes and miscellaneous enhancements, detailed below. ## Known Issues ### "Run-to-run" Determinism Broken Several CUB device algorithms are documented to provide deterministic results (per device) for non-associative reduction operators (e.g. floating-point addition). Unfortunately, the implementations of these algorithms contain performance optimizations that violate this guarantee. The `DeviceReduce::ReduceByKey` and `DeviceScan` algorithms are known to be affected. We’re currently evaluating the scope and impact of correcting this in a future CUB release. See NVIDIA/cub#471 for details. ## Bug Fixes - NVIDIA/cub#444: Fixed `DeviceSelect` to work with discard iterators and mixed input/output types. - NVIDIA/cub#452: Fixed install issue when `CMAKE_INSTALL_LIBDIR` contained nested directories. Thanks to @robertmaynard for this contribution. - NVIDIA/cub#462: Fixed bug that produced incorrect results from `DeviceSegmentedSort` on sm_61 and sm_70. - NVIDIA/cub#464: Fixed `DeviceSelect::Flagged` so that flags are normalized to 0 or 1. - NVIDIA/cub#468: Fixed overflow issues in `DeviceRadixSort` given `num_items` close to 2^32. Thanks to @canonizer for this contribution. - NVIDIA/cub#498: Fixed compiler regression in `BlockAdjacentDifference`. Thanks to @MKKnorr for this contribution. ## Other Enhancements - NVIDIA/cub#445: Remove device-sync in `DeviceSegmentedSort` when launched via CDP. - NVIDIA/cub#449: Fixed invalid link in documentation. Thanks to @kshitij12345 for this contribution. - NVIDIA/cub#450: `BlockDiscontinuity`: Replaced recursive-template loop unrolling with `#pragma unroll`. Thanks to @kshitij12345 for this contribution. - NVIDIA/cub#451: Replaced the deprecated `TexRefInputIterator` implementation with an alias to `TexObjInputIterator`. This fully removes all usages of the deprecated CUDA texture reference APIs from CUB. - NVIDIA/cub#456: `BlockAdjacentDifference`: Replaced recursive-template loop unrolling with `#pragma unroll`. Thanks to @kshitij12345 for this contribution. - NVIDIA/cub#466: `cub::DeviceAdjacentDifference` API has been updated to use the new `OffsetT` deduction approach described in NVIDIA/cub#212. - NVIDIA/cub#470: Fix several doxygen-related warnings. Thanks to @karthikeyann for this contribution. # CUB 1.16.0 ## Summary CUB 1.16.0 is a major release providing several improvements to the device scope algorithms. `DeviceRadixSort` now supports large (64-bit indexed) input data. A new `UniqueByKey` algorithm has been added to `DeviceSelect`. `DeviceAdjacentDifference` provides new `SubtractLeft` and `SubtractRight` functionality. This release also deprecates several obsolete APIs, including type traits and `BlockAdjacentDifference` algorithms. Many bugfixes and documentation updates are also included. 
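
For orientation, the sketch below shows how the new `cub::DeviceSelect::UniqueByKey` entry point described in the subsections that follow is typically driven, using CUB's usual two-pass temporary-storage idiom. The wrapper function, its name, and the concrete iterator types are illustrative only and are not part of the release notes:

```cpp
#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Illustrative wrapper (not part of CUB): keeps the first value of each run
// of consecutive equal keys, using the standard two-pass CUB invocation.
cudaError_t unique_by_key_example(const int   *d_keys_in,
                                  const float *d_values_in,
                                  int         *d_keys_out,
                                  float       *d_values_out,
                                  int         *d_num_selected_out,
                                  int          num_items)
{
  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // Pass 1: d_temp_storage is null, so only temp_storage_bytes is written.
  cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes,
                                 d_keys_in, d_values_in,
                                 d_keys_out, d_values_out,
                                 d_num_selected_out, num_items);

  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Pass 2: perform the selection.
  cudaError_t status =
    cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes,
                                   d_keys_in, d_values_in,
                                   d_keys_out, d_values_out,
                                   d_num_selected_out, num_items);

  cudaFree(d_temp_storage);
  return status;
}
```

The same query-allocate-invoke pattern applies to the other device-scope algorithms added in this release.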
### 64-bit Offsets in `DeviceRadixSort` Public APIs Users frequently want to process large datasets using CUB’s device-scope algorithms, but the current public APIs limit input data sizes to those that can be indexed by a 32-bit integer. Beginning with this release, CUB is updating these APIs to support 64-bit offsets, as discussed in NVIDIA/cub#212. The device-scope algorithms will be updated with 64-bit offset support incrementally, starting with the `cub::DeviceRadixSort` family of algorithms. Thanks to @canonizer for contributing this functionality. ### New `DeviceSelect::UniqueByKey` Algorithm `cub::DeviceSelect` now provides a `UniqueByKey` algorithm, which has been ported from Thrust. Thanks to @zasdfgbnm for this contribution. ### New `DeviceAdjacentDifference` Algorithms The new `cub::DeviceAdjacentDifference` interface, also ported from Thrust, provides `SubtractLeft` and `SubtractRight` algorithms as CUB kernels. ## Deprecation Notices ### Synchronous CUDA Dynamic Parallelism Support **A future version of CUB will change the `debug_synchronous` behavior of device-scope algorithms when invoked via CUDA Dynamic Parallelism (CDP).** This will only affect calls to CUB device-scope algorithms launched from device-side code with `debug_synchronous = true`. Such invocations will continue to print extra debugging information, but they will no longer synchronize after kernel launches. ### Deprecated Traits CUB provided a variety of metaprogramming type traits in order to support C++03. Since C++14 is now required, these traits have been deprecated in favor of their STL equivalents, as shown below: | Deprecated CUB Trait | Replacement STL Trait | |-----------------------|-----------------------| | cub::If | std::conditional | | cub::Equals | std::is_same | | cub::IsPointer | std::is_pointer | | cub::IsVolatile | std::is_volatile | | cub::RemoveQualifiers | std::remove_cv | | cub::EnableIf | std::enable_if | CUB now uses the STL traits internally, resulting in a ~6% improvement in compile time. ### Misnamed `cub::BlockAdjacentDifference` APIs The algorithms in `cub::BlockAdjacentDifference` have been deprecated, as their names did not clearly describe their intent. The `FlagHeads` method is now `SubtractLeft`, and `FlagTails` has been replaced by `SubtractRight`. ## Breaking Changes - NVIDIA/cub#331: Deprecate the misnamed `BlockAdjacentDifference::FlagHeads` and `FlagTails` methods. Use the new `SubtractLeft` and `SubtractRight` methods instead. - NVIDIA/cub#364: Deprecate some obsolete type traits. These should be replaced by the equivalent traits in `` as described above. ## New Features - NVIDIA/cub#331: Port the `thrust::adjacent_difference` kernel and expose it as `cub::DeviceAdjacentDifference`. - NVIDIA/cub#405: Port the `thrust::unique_by_key` kernel and expose it as `cub::DeviceSelect::UniqueByKey`. Thanks to @zasdfgbnm for this contribution. ## Enhancements - NVIDIA/cub#340: Allow 64-bit offsets in `DeviceRadixSort` public APIs. Thanks to @canonizer for this contribution. - NVIDIA/cub#400: Implement a significant reduction in `DeviceMergeSort` compilation time. - NVIDIA/cub#415: Support user-defined `CMAKE_INSTALL_INCLUDEDIR` values in Thrust’s CMake install rules. Thanks for @robertmaynard for this contribution. ## Bug Fixes - NVIDIA/cub#381: Fix shared memory alignment in `dyn_smem` example. - NVIDIA/cub#393: Fix some collisions with the `min`/`max` macros defined in `windows.h`. - NVIDIA/cub#404: Fix bad cast in `util_device`. 
- NVIDIA/cub#410: Fix CDP issues in `DeviceSegmentedSort`.
- NVIDIA/cub#411: Ensure that the `nv_exec_check_disable` pragma is only used on nvcc.
- NVIDIA/cub#418: Fix `-Wsizeof-array-div` warning on gcc 11. Thanks to @robertmaynard for this contribution.
- NVIDIA/cub#420: Fix new uninitialized variable warning in `DiscardIterator` on gcc 10.
- NVIDIA/cub#423: Fix some collisions with the `small` macro defined in `windows.h`.
- NVIDIA/cub#426: Fix some issues with version handling in CUB's CMake packages.
- NVIDIA/cub#430: Remove documentation for `DeviceSpmv` parameters that are absent from public APIs.
- NVIDIA/cub#432: Remove incorrect documentation for `DeviceScan` algorithms that guaranteed run-to-run deterministic results for floating-point addition.

# CUB 1.15.0 (NVIDIA HPC SDK 22.1, CUDA Toolkit 11.6)

## Summary

CUB 1.15.0 includes a new `cub::DeviceSegmentedSort` algorithm, which demonstrates up to 5000x speedup compared to `cub::DeviceSegmentedRadixSort` when sorting a large number of small segments. A new `cub::FutureValue` helper allows the `cub::DeviceScan` algorithms to lazily load the `initial_value` from a pointer. `cub::DeviceScan` also added `ScanByKey` functionality.

The new `DeviceSegmentedSort` algorithm partitions segments into size groups. Each group is processed with specialized kernels using a variety of sorting algorithms. This approach varies the number of threads allocated for sorting each segment and utilizes the GPU more efficiently.

`cub::FutureValue` provides the ability to use the result of a previous kernel as a scalar input to a CUB device-scope algorithm without unnecessary synchronization:

```cpp
int *d_intermediate_result = ...;
intermediate_kernel<<<blocks, threads>>>(d_intermediate_result,  // output
                                         arg1,                   // input
                                         arg2);                  // input

// Wrap the intermediate pointer in a FutureValue -- no need to explicitly
// sync when both kernels are stream-ordered. The pointer is read after
// the ExclusiveScan kernel starts executing.
cub::FutureValue<int> init_value(d_intermediate_result);

cub::DeviceScan::ExclusiveScan(d_temp_storage,
                               temp_storage_bytes,
                               d_in,
                               d_out,
                               cub::Sum(),
                               init_value,
                               num_items);
```

Previously, an explicit synchronization would have been necessary to obtain the intermediate result, which was passed by value into `ExclusiveScan`. This new feature enables better performance in workflows that use `cub::DeviceScan`.

## Deprecation Notices

**A future version of CUB will change the `debug_synchronous` behavior of device-scope algorithms when invoked via CUDA Dynamic Parallelism (CDP).** This will only affect calls to CUB device-scope algorithms launched from device-side code with `debug_synchronous = true`. These algorithms will continue to print extra debugging information, but they will no longer synchronize after kernel launches.

## Breaking Changes

- NVIDIA/cub#305: The template parameters of `cub::DispatchScan` have changed to support the new `cub::FutureValue` helper. More details under "New Features".
- NVIDIA/cub#377: Remove broken `operator->()` from `cub::TransformInputIterator`, since this cannot be implemented without returning a temporary object's address. Thanks to Xiang Gao (@zasdfgbnm) for this contribution.

## New Features

- NVIDIA/cub#305: Add overloads to `cub::DeviceScan` algorithms that allow the output of a previous kernel to be used as `initial_value` without explicit synchronization. See the new `cub::FutureValue` helper for details. Thanks to Xiang Gao (@zasdfgbnm) for this contribution.
- NVIDIA/cub#354: Add `cub::BlockRunLengthDecode` algorithm.
Thanks to Elias Stehle (@elstehle) for this contribution. - NVIDIA/cub#357: Add `cub::DeviceSegmentedSort`, an optimized version of `cub::DeviceSegmentedSort` with improved load balancing and small array performance. - NVIDIA/cub#376: Add "by key" overloads to `cub::DeviceScan`. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. ## Bug Fixes - NVIDIA/cub#349: Doxygen and unused variable fixes. - NVIDIA/cub#363: Maintenance updates for the new `cub::DeviceMergeSort` algorithms. - NVIDIA/cub#382: Fix several `-Wconversion` warnings. Thanks to Matt Stack (@matt-stack) for this contribution. - NVIDIA/cub#388: Fix debug assertion on MSVC when using `cub::CachingDeviceAllocator`. - NVIDIA/cub#395: Support building with `__CUDA_NO_HALF_CONVERSIONS__`. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. # CUB 1.14.0 (NVIDIA HPC SDK 21.9) ## Summary CUB 1.14.0 is a major release accompanying the NVIDIA HPC SDK 21.9. This release provides the often-requested merge sort algorithm, ported from the `thrust::sort` implementation. Merge sort provides more flexibility than the existing radix sort by supporting arbitrary data types and comparators, though radix sorting is still faster for supported inputs. This functionality is provided through the new `cub::DeviceMergeSort` and `cub::BlockMergeSort` algorithms. The namespace wrapping mechanism has been overhauled for 1.14. The existing macros (`CUB_NS_PREFIX`/`CUB_NS_POSTFIX`) can now be replaced by a single macro, `CUB_WRAPPED_NAMESPACE`, which is set to the name of the desired wrapped namespace. Defining a similar `THRUST_CUB_WRAPPED_NAMESPACE` macro will embed both `thrust::` and `cub::` symbols in the same external namespace. The prefix/postfix macros are still supported, but now require a new `CUB_NS_QUALIFIER` macro to be defined, which provides the fully qualified CUB namespace (e.g. `::foo::cub`). See `cub/util_namespace.cuh` for details. ## Breaking Changes - NVIDIA/cub#350: When the `CUB_NS_[PRE|POST]FIX` macros are set, `CUB_NS_QUALIFIER` must also be defined to the fully qualified CUB namespace (e.g. `#define CUB_NS_QUALIFIER ::foo::cub`). Note that this is handled automatically when using the new `[THRUST_]CUB_WRAPPED_NAMESPACE` mechanism. ## New Features - NVIDIA/cub#322: Ported the merge sort algorithm from Thrust: `cub::BlockMergeSort` and `cub::DeviceMergeSort` are now available. - NVIDIA/cub#326: Simplify the namespace wrapper macros, and detect when Thrust's symbols are in a wrapped namespace. ## Bug Fixes - NVIDIA/cub#160, NVIDIA/cub#163, NVIDIA/cub#352: Fixed several bugs in `cub::DeviceSpmv` and added basic tests for this algorithm. Thanks to James Wyles and Seunghwa Kang for their contributions. - NVIDIA/cub#328: Fixed error handling bug and incorrect debugging output in `cub::CachingDeviceAllocator`. Thanks to Felix Kallenborn for this contribution. - NVIDIA/cub#335: Fixed a compile error affecting clang and NVRTC. Thanks to Jiading Guo for this contribution. - NVIDIA/cub#351: Fixed some errors in the `cub::DeviceHistogram` documentation. ## Enhancements - NVIDIA/cub#348: Add an example that demonstrates how to use dynamic shared memory with a CUB block algorithm. Thanks to Matthias Jouanneaux for this contribution. # CUB 1.13.1 (CUDA Toolkit 11.5) CUB 1.13.1 is a minor release accompanying the CUDA Toolkit 11.5. This release provides a new hook for embedding the `cub::` namespace inside a custom namespace. This is intended to work around various issues related to linking multiple shared libraries that use CUB. 
The existing `CUB_NS_PREFIX` and `CUB_NS_POSTFIX` macros already provided this capability; this update provides a simpler mechanism that is extended to and integrated with Thrust. Simply define `THRUST_CUB_WRAPPED_NAMESPACE` to a namespace name, and both `thrust::` and `cub::` will be placed inside the new namespace. Using different wrapped namespaces for each shared library will prevent issues like those reported in NVIDIA/thrust#1401. ## New Features - NVIDIA/cub#326: Add `THRUST_CUB_WRAPPED_NAMESPACE` hooks. # CUB 1.13.0 (NVIDIA HPC SDK 21.7) CUB 1.13.0 is the major release accompanying the NVIDIA HPC SDK 21.7 release. Notable new features include support for striped data arrangements in block load/store utilities, `bfloat16` radix sort support, and fewer restrictions on offset iterators in segmented device algorithms. Several bugs in `cub::BlockShuffle`, `cub::BlockDiscontinuity`, and `cub::DeviceHistogram` have been addressed. The amount of code generated in `cub::DeviceScan` has been greatly reduced, leading to significant compile-time improvements when targeting multiple PTX architectures. This release also includes several user-contributed documentation fixes that will be reflected in CUB's online documentation in the coming weeks. ## Breaking Changes - NVIDIA/cub#320: Deprecated `cub::TexRefInputIterator`. Use `cub::TexObjInputIterator` as a replacement. ## New Features - NVIDIA/cub#274: Add `BLOCK_LOAD_STRIPED` and `BLOCK_STORE_STRIPED` functionality to `cub::BlockLoadAlgorithm` and `cub::BlockStoreAlgorithm`. Thanks to Matthew Nicely (@mnicely) for this contribution. - NVIDIA/cub#291: `cub::DeviceSegmentedRadixSort` and `cub::DeviceSegmentedReduce` now support different types for begin/end offset iterators. Thanks to Sergey Pavlov (@psvvsp) for this contribution. - NVIDIA/cub#306: Add `bfloat16` support to `cub::DeviceRadixSort`. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. - NVIDIA/cub#320: Introduce a new `CUB_IGNORE_DEPRECATED_API` macro that disables deprecation warnings on Thrust and CUB APIs. ## Bug Fixes - NVIDIA/cub#277: Fixed sanitizer warnings in `RadixSortScanBinsKernels`. Thanks to Andy Adinets (@canonizer) for this contribution. - NVIDIA/cub#287: `cub::DeviceHistogram` now correctly handles cases where `OffsetT` is not an `int`. Thanks to Dominique LaSalle (@nv-dlasalle) for this contribution. - NVIDIA/cub#311: Fixed several bugs and added tests for the `cub::BlockShuffle` collective operations. - NVIDIA/cub#312: Eliminate unnecessary kernel instantiations when compiling `cub::DeviceScan`. Thanks to Elias Stehle (@elstehle) for this contribution. - NVIDIA/cub#319: Fixed out-of-bounds memory access on debugging builds of `cub::BlockDiscontinuity::FlagHeadsAndTails`. - NVIDIA/cub#320: Fixed harmless missing return statement warning in unreachable `cub::TexObjInputIterator` code path. ## Other Enhancements - Several documentation fixes are included in this release. - NVIDIA/cub#275: Fixed comments describing the `cub::If` and `cub::Equals` utilities. Thanks to Rukshan Jayasekara (@rukshan99) for this contribution. - NVIDIA/cub#290: Documented that `cub::DeviceSegmentedReduce` will produce consistent results run-to-run on the same device for pseudo-associated reduction operators. Thanks to Himanshu (@himanshu007-creator) for this contribution. - NVIDIA/cub#298: `CONTRIBUTING.md` now refers to Thrust's build instructions for developer builds, which is the preferred way to build the CUB test harness. Thanks to Xiang Gao (@zasdfgbnm) for contributing. 
- NVIDIA/cub#301: Expand `cub::DeviceScan` documentation to include in-place support and add tests. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. - NVIDIA/cub#307: Expand `cub::DeviceRadixSort` and `cub::BlockRadixSort` documentation to clarify stability, in-place support, and type-specific bitwise transformations. Thanks to Himanshu (@himanshu007-creator) for contributing. - NVIDIA/cub#316: Move `WARP_TIME_SLICING` documentation to the correct location. Thanks to Peter Han (@peter9606) for this contribution. - NVIDIA/cub#321: Update URLs from deprecated github.com to preferred github.io. Thanks to Lilo Huang (@lilohuang) for this contribution. # CUB 1.12.1 (CUDA Toolkit 11.4) CUB 1.12.1 is a trivial patch release that slightly changes the phrasing of a deprecation message. # CUB 1.12.0 (NVIDIA HPC SDK 21.3) ## Summary CUB 1.12.0 is a bugfix release accompanying the NVIDIA HPC SDK 21.3 and the CUDA Toolkit 11.4. Radix sort is now stable when both +0.0 and -0.0 are present in the input (they are treated as equivalent). Many compilation warnings and subtle overflow bugs were fixed in the device algorithms, including a long-standing bug that returned invalid temporary storage requirements when `num_items` was close to (but not exceeding) `INT32_MAX`. Support for Clang < 7.0 and MSVC < 2019 (aka 19.20/16.0/14.20) is now deprecated. ## Breaking Changes - NVIDIA/cub#256: Deprecate Clang < 7 and MSVC < 2019. ## New Features - NVIDIA/cub#218: Radix sort now treats -0.0 and +0.0 as equivalent for floating point types, which is required for the sort to be stable. Thanks to Andy Adinets for this contribution. ## Bug Fixes - NVIDIA/cub#247: Suppress newly triggered warnings in Clang. Thanks to Andrew Corrigan for this contribution. - NVIDIA/cub#249: Enable stricter warning flags. This fixes a number of outstanding issues: - NVIDIA/cub#221: Overflow in `temp_storage_bytes` when `num_items` close to (but not over) `INT32_MAX`. - NVIDIA/cub#228: CUB uses non-standard C++ extensions that break strict compilers. - NVIDIA/cub#257: Warning when compiling `GridEvenShare` with unsigned offsets. - NVIDIA/cub#258: Use correct `OffsetT` in `DispatchRadixSort::InitPassConfig`. Thanks to Felix Kallenborn for this contribution. - NVIDIA/cub#259: Remove some problematic `__forceinline__` annotations. ## Other Enhancements - NVIDIA/cub#123: Fix incorrect issue number in changelog. Thanks to Peet Whittaker for this contribution. # CUB 1.11.0 (CUDA Toolkit 11.3) ## Summary CUB 1.11.0 is a major release accompanying the CUDA Toolkit 11.3 release, providing bugfixes and performance enhancements. It includes a new `DeviceRadixSort` backend that improves performance by up to 2x on supported keys and hardware. Our CMake package and build system continue to see improvements with `add_subdirectory` support, installation rules, status messages, and other features that make CUB easier to use from CMake projects. The release includes several other bugfixes and modernizations, and received updates from 11 contributors. ## Breaking Changes - NVIDIA/cub#201: The intermediate accumulator type used when `DeviceScan` is invoked with different input/output types is now consistent with [P0571](https://wg21.link/P0571). This may produce different results for some edge cases when compared with earlier releases of CUB. ## New Features - NVIDIA/cub#204: Faster `DeviceRadixSort`, up to 2x performance increase for 32/64-bit keys on Pascal and up (SM60+). Thanks to Andy Adinets for this contribution. 
- Unroll loops in `BlockRadixRank` to improve performance for 32-bit keys by 1.5-2x on Clang CUDA. Thanks to Justin Lebar for this contribution. - NVIDIA/cub#200: Allow CUB to be added to CMake projects via `add_subdirectory`. - NVIDIA/cub#214: Optionally add install rules when included with CMake's `add_subdirectory`. Thanks to Kai Germaschewski for this contribution. ## Bug Fixes - NVIDIA/cub#215: Fix integer truncation in `AgentReduceByKey`, `AgentScan`, and `AgentSegmentFixup`. Thanks to Rory Mitchell for this contribution. - NVIDIA/cub#225: Fix compile-time regression when defining `CUB_NS_PREFIX` /`CUB_NS_POSTFIX` macro. Thanks to Elias Stehle for this contribution. - NVIDIA/cub#210: Fix some edge cases in `DeviceScan`: - Use values from the input when padding temporary buffers. This prevents custom functors from getting unexpected values. - Prevent integer truncation when using large indices via the `DispatchScan` layer. - Use timesliced reads/writes for types > 128 bytes. - NVIDIA/cub#217: Fix and add test for cmake package install rules. Thanks to Keith Kraus and Kai Germaschewski for testing and discussion. - NVIDIA/cub#170, NVIDIA/cub#233: Update CUDA version checks to behave on Clang CUDA and `nvc++`. Thanks to Artem Belevich, Andrew Corrigan, and David Olsen for these contributions. - NVIDIA/cub#220, NVIDIA/cub#216: Various fixes for Clang CUDA. Thanks to Andrew Corrigan for these contributions. - NVIDIA/cub#231: Fix signedness mismatch warnings in unit tests. - NVIDIA/cub#231: Suppress GPU deprecation warnings. - NVIDIA/cub#214: Use semantic versioning rules for our CMake package's compatibility checks. Thanks to Kai Germaschewski for this contribution. - NVIDIA/cub#214: Use `FindPackageHandleStandardArgs` to print standard status messages when our CMake package is found. Thanks to Kai Germaschewski for this contribution. - NVIDIA/cub#207: Fix `CubDebug` usage in `CachingDeviceAllocator::DeviceAllocate`. Thanks to Andreas Hehn for this contribution. - Fix documentation for `DevicePartition`. Thanks to ByteHamster for this contribution. - Clean up unused code in `DispatchScan`. Thanks to ByteHamster for this contribution. ## Other Enhancements - NVIDIA/cub#213: Remove tuning policies for unsupported hardware (`: `CUB_VERSION`, `CUB_VERSION_MAJOR`, `CUB_VERSION_MINOR`, `CUB_VERSION_SUBMINOR`, and `CUB_PATCH_NUMBER`. - Platform detection machinery: - ``: Detects the C++ standard dialect. - ``: host and device compiler detection. - ``: `CUB_DEPRECATED`. - `: Includes ``, ``, ``, ``, ``, `` - `cub::DeviceCount` and `cub::DeviceCountUncached`, caching abstractions for `cudaGetDeviceCount`. ## Other Enhancements - Lazily initialize the per-device CUDAattribute caches, because CUDA context creation is expensive and adds up with large CUDA binaries on machines with many GPUs. Thanks to the NVIDIA PyTorch team for bringing this to our attention. - Make `cub::SwitchDevice` avoid setting/resetting the device if the current device is the same as the target device. ## Bug Fixes - Add explicit failure parameter to CAS in the CUB attribute cache to workaround a GCC 4.8 bug. - Revert a change in reductions that changed the signedness of the `lane_id` variable to suppress a warning, as this introduces a bug in optimized device code. - Fix initialization in `cub::ExclusiveSum`. Thanks to Conor Hoekstra for this contribution. - Fix initialization of the `std::array` in the CUB attribute cache. - Fix `-Wsign-compare` warnings. Thanks to Elias Stehle for this contribution. 
- Fix `test_block_reduce.cu` to build without parameters. Thanks to Francis Lemaire for this contribution. - Add missing includes to `grid_even_share.cuh`. Thanks to Francis Lemaire for this contribution. - Add missing includes to `thread_search.cuh`. Thanks to Francis Lemaire for this contribution. - Add missing includes to `cub.cuh`. Thanks to Felix Kallenborn for this contribution. # CUB 1.9.8-1 (NVIDIA HPC SDK 20.3) ## Summary CUB 1.9.8-1 is a variant of 1.9.8 accompanying the NVIDIA HPC SDK 20.3 release. It contains modifications necessary to serve as the implementation of NVC++'s GPU-accelerated C++17 Parallel Algorithms. # CUB 1.9.8 (CUDA 11.0 Early Access) ## Summary CUB 1.9.8 is the first release of CUB to be officially supported and included in the CUDA Toolkit. When compiling CUB in C++11 mode, CUB now caches calls to CUDA attribute query APIs, which improves performance of these queries by 20x to 50x when they are called concurrently by multiple host threads. ## Enhancements - (C++11 or later) Cache calls to `cudaFuncGetAttributes` and `cudaDeviceGetAttribute` within `cub::PtxVersion` and `cub::SmVersion`. These CUDA APIs acquire locks to CUDA driver/runtime mutex and perform poorly under contention; with the caching, they are 20 to 50x faster when called concurrently. Thanks to Bilge Acun for bringing this issue to our attention. - `DispatchReduce` now takes an `OutputT` template parameter so that users can specify the intermediate type explicitly. - Radix sort tuning policies updates to fix performance issues for element types smaller than 4 bytes. ## Bug Fixes - Change initialization style from copy initialization to direct initialization (which is more permissive) in `AgentReduce` to allow a wider range of types to be used with it. - Fix bad signed/unsigned comparisons in `WarpReduce`. - Fix computation of valid lanes in warp-level reduction primitive to correctly handle the case where there are 0 input items per warp. # CUB 1.8.0 ## Summary CUB 1.8.0 introduces changes to the `cub::Shuffle*` interfaces. ## Breaking Changes - The interfaces of `cub::ShuffleIndex`, `cub::ShuffleUp`, and `cub::ShuffleDown` have been changed to allow for better computation of the PTX SHFL control constant for logical warps smaller than 32 threads. ## Bug Fixes - #112: Fix `cub::WarpScan`'s broadcast of warp-wide aggregate for logical warps smaller than 32 threads. # CUB 1.7.5 ## Summary CUB 1.7.5 adds support for radix sorting `__half` keys and improved sorting performance for 1 byte keys. It was incorporated into Thrust 1.9.2. ## Enhancements - Radix sort support for `__half` keys. - Radix sort tuning policy updates to improve 1 byte key performance. ## Bug Fixes - Syntax tweaks to mollify Clang. - #127: `cub::DeviceRunLengthEncode::Encode` returns incorrect results. - #128: 7-bit sorting passes fail for SM61 with large values. # CUB 1.7.4 ## Summary CUB 1.7.4 is a minor release that was incorporated into Thrust 1.9.1-2. ## Bug Fixes - #114: Can't pair non-trivially-constructible values in radix sort. - #115: `cub::WarpReduce` segmented reduction is broken in CUDA 9 for logical warp sizes smaller than 32. # CUB 1.7.3 ## Summary CUB 1.7.3 is a minor release. ## Bug Fixes - #110: `cub::DeviceHistogram` null-pointer exception bug for iterator inputs. # CUB 1.7.2 ## Summary CUB 1.7.2 is a minor release. ## Bug Fixes - #108: Device-wide reduction is now "run-to-run" deterministic for pseudo-associative reduction operators (like floating point addition). 
# CUB 1.7.1 ## Summary CUB 1.7.1 delivers improved radix sort performance on SM7x (Volta) GPUs and a number of bug fixes. ## Enhancements - Radix sort tuning policies updated for SM7x (Volta). ## Bug Fixes - #104: `uint64_t` `cub::WarpReduce` broken for CUB 1.7.0 on CUDA 8 and older. - #103: Can't mix Thrust from CUDA 9.0 and CUB. - #102: CUB pulls in `windows.h` which defines `min`/`max` macros that conflict with `std::min`/`std::max`. - #99: Radix sorting crashes NVCC on Windows 10 for SM52. - #98: cuda-memcheck: --tool initcheck failed with lineOfSight. - #94: Git clone size. - #93: Accept iterators for segment offsets. - #87: CUB uses anonymous unions which is not valid C++. - #44: Check for C++11 is incorrect for Visual Studio 2013. # CUB 1.7.0 ## Summary CUB 1.7.0 brings support for CUDA 9.0 and SM7x (Volta) GPUs. It is compatible with independent thread scheduling. It was incorporated into Thrust 1.9.0-5. ## Breaking Changes - Remove `cub::WarpAll` and `cub::WarpAny`. These functions served to emulate `__all` and `__any` functionality for SM1x devices, which did not have those operations. However, SM1x devices are now deprecated in CUDA, and the interfaces of these two functions are now lacking the lane-mask needed for collectives to run on SM7x and newer GPUs which have independent thread scheduling. ## Other Enhancements - Remove any assumptions of implicit warp synchronization to be compatible with SM7x's (Volta) independent thread scheduling. ## Bug Fixes - #86: Incorrect results with reduce-by-key. # CUB 1.6.4 ## Summary CUB 1.6.4 improves radix sorting performance for SM5x (Maxwell) and SM6x (Pascal) GPUs. ## Enhancements - Radix sort tuning policies updated for SM5x (Maxwell) and SM6x (Pascal) - 3.5B and 3.4B 32 byte keys/s on TitanX and GTX 1080, respectively. ## Bug Fixes - Restore fence work-around for scan (reduce-by-key, etc.) hangs in CUDA 8.5. - #65: `cub::DeviceSegmentedRadixSort` should allow inputs to have pointer-to-const type. - Mollify Clang device-side warnings. - Remove out-dated MSVC project files. # CUB 1.6.3 ## Summary CUB 1.6.3 improves support for Windows, changes `cub::BlockLoad`/`cub::BlockStore` interface to take the local data type, and enhances radix sort performance for SM6x (Pascal) GPUs. ## Breaking Changes - `cub::BlockLoad` and `cub::BlockStore` are now templated by the local data type, instead of the `Iterator` type. This allows for output iterators having `void` as their `value_type` (e.g. discard iterators). ## Other Enhancements - Radix sort tuning policies updated for SM6x (Pascal) GPUs - 6.2B 4 byte keys/s on GP100. - Improved support for Windows (warnings, alignment, etc). ## Bug Fixes - #74: `cub::WarpReduce` executes reduction operator for out-of-bounds items. - #72: `cub:InequalityWrapper::operator` should be non-const. - #71: `cub::KeyValuePair` won't work if `Key` has non-trivial constructor. - #69: cub::BlockStore::Store` doesn't compile if `OutputIteratorT::value_type` isn't `T`. - #68: `cub::TilePrefixCallbackOp::WarpReduce` doesn't permit PTX arch specialization. # CUB 1.6.2 (previously 1.5.5) ## Summary CUB 1.6.2 (previously 1.5.5) improves radix sort performance for SM6x (Pascal) GPUs. ## Enhancements - Radix sort tuning policies updated for SM6x (Pascal) GPUs. ## Bug Fixes - Fix AArch64 compilation of `cub::CachingDeviceAllocator`. # CUB 1.6.1 (previously 1.5.4) ## Summary CUB 1.6.1 (previously 1.5.4) is a minor release. ## Bug Fixes - Fix radix sorting bug introduced by scan refactorization. 
# CUB 1.6.0 (previously 1.5.3) ## Summary CUB 1.6.0 changes the scan and reduce interfaces. Exclusive scans now accept an "initial value" instead of an "identity value". Scans and reductions now support differing input and output sequence types. Additionally, many bugs have been fixed. ## Breaking Changes - Device/block/warp-wide exclusive scans have been revised to now accept an "initial value" (instead of an "identity value") for seeding the computation with an arbitrary prefix. - Device-wide reductions and scans can now have input sequence types that are different from output sequence types (as long as they are convertible). ## Other Enhancements - Reduce repository size by moving the doxygen binary to doc repository. - Minor reduction in `cub::BlockScan` instruction counts. ## Bug Fixes - Issue #55: Warning in `cub/device/dispatch/dispatch_reduce_by_key.cuh`. - Issue #59: `cub::DeviceScan::ExclusiveSum` can't prefix sum of float into double. - Issue #58: Infinite loop in `cub::CachingDeviceAllocator::NearestPowerOf`. - Issue #47: `cub::CachingDeviceAllocator` needs to clean up CUDA global error state upon successful retry. - Issue #46: Very high amount of needed memory from the `cub::DeviceHistogram::HistogramEven`. - Issue #45: `cub::CachingDeviceAllocator` fails with debug output enabled # CUB 1.5.2 ## Summary CUB 1.5.2 enhances `cub::CachingDeviceAllocator` and improves scan performance for SM5x (Maxwell). ## Enhancements - Improved medium-size scan performance on SM5x (Maxwell). - Refactored `cub::CachingDeviceAllocator`: - Now spends less time locked. - Uses C++11's `std::mutex` when available. - Failure to allocate a block from the runtime will retry once after freeing cached allocations. - Now respects max-bin, fixing an issue where blocks in excess of max-bin were still being retained in the free cache. ## Bug fixes: - Fix for generic-type reduce-by-key `cub::WarpScan` for SM3x and newer GPUs. # CUB 1.5.1 ## Summary CUB 1.5.1 is a minor release. ## Bug Fixes - Fix for incorrect `cub::DeviceRadixSort` output for some small problems on SM52 (Mawell) GPUs. - Fix for macro redefinition warnings when compiling `thrust::sort`. # CUB 1.5.0 CUB 1.5.0 introduces segmented sort and reduction primitives. ## New Features: - Segmented device-wide operations for device-wide sort and reduction primitives. ## Bug Fixes: - #36: `cub::ThreadLoad` generates compiler errors when loading from pointer-to-const. - #29: `cub::DeviceRadixSort::SortKeys` yields compiler errors. - #26: Misaligned address after `cub::DeviceRadixSort::SortKeys`. - #25: Fix for incorrect results and crashes when radix sorting 0-length problems. - Fix CUDA 7.5 issues on SM52 GPUs with SHFL-based warp-scan and warp-reduction on non-primitive data types (e.g. user-defined structs). - Fix small radix sorting problems where 0 temporary bytes were required and users code was invoking `malloc(0)` on some systems where that returns `NULL`. CUB assumed the user was asking for the size again and not running the sort. # CUB 1.4.1 ## Summary CUB 1.4.1 is a minor release. ## Enhancements - Allow `cub::DeviceRadixSort` and `cub::BlockRadixSort` on bool types. ## Bug Fixes - Fix minor CUDA 7.0 performance regressions in `cub::DeviceScan` and `cub::DeviceReduceByKey`. - Remove requirement for callers to define the `CUB_CDP` macro when invoking CUB device-wide rountines using CUDA dynamic parallelism. - Fix headers not being included in the proper order (or missing includes) for some block-wide functions. 
# CUB 1.4.0 ## Summary CUB 1.4.0 adds `cub::DeviceSpmv`, `cub::DeviceRunLength::NonTrivialRuns`, improves `cub::DeviceHistogram`, and introduces support for SM5x (Maxwell) GPUs. ## New Features: - `cub::DeviceSpmv` methods for multiplying sparse matrices by dense vectors, load-balanced using a merge-based parallel decomposition. - `cub::DeviceRadixSort` sorting entry-points that always return the sorted output into the specified buffer, as opposed to the `cub::DoubleBuffer` in which it could end up in either buffer. - `cub::DeviceRunLengthEncode::NonTrivialRuns` for finding the starting offsets and lengths of all non-trivial runs (i.e., length > 1) of keys in a given sequence. Useful for top-down partitioning algorithms like MSD sorting of very-large keys. ## Other Enhancements - Support and performance tuning for SM5x (Maxwell) GPUs. - Updated cub::DeviceHistogram implementation that provides the same "histogram-even" and "histogram-range" functionality as IPP/NPP. Provides extremely fast and, perhaps more importantly, very uniform performance response across diverse real-world datasets, including pathological (homogeneous) sample distributions. # CUB 1.3.2 ## Summary CUB 1.3.2 is a minor release. ## Bug Fixes - Fix `cub::DeviceReduce` where reductions of small problems (small enough to only dispatch a single thread block) would run in the default stream (stream zero) regardless of whether an alternate stream was specified. # CUB 1.3.1 ## Summary CUB 1.3.1 is a minor release. ## Bug Fixes - Workaround for a benign WAW race warning reported by cuda-memcheck in `cub::BlockScan` specialized for `BLOCK_SCAN_WARP_SCANS` algorithm. - Fix bug in `cub::DeviceRadixSort` where the algorithm may sort more key bits than the caller specified (up to the nearest radix digit). - Fix for ~3% `cub::DeviceRadixSort` performance regression on SM2x (Fermi) and SM3x (Kepler) GPUs. # CUB 1.3.0 ## Summary CUB 1.3.0 improves how thread blocks are expressed in block- and warp-wide primitives and adds an enhanced version of `cub::WarpScan`. ## Breaking Changes - CUB's collective (block-wide, warp-wide) primitives underwent a minor interface refactoring: - To provide the appropriate support for multidimensional thread blocks, The interfaces for collective classes are now template-parameterized by X, Y, and Z block dimensions (with `BLOCK_DIM_Y` and `BLOCK_DIM_Z` being optional, and `BLOCK_DIM_X` replacing `BLOCK_THREADS`). Furthermore, the constructors that accept remapped linear thread-identifiers have been removed: all primitives now assume a row-major thread-ranking for multidimensional thread blocks. - To allow the host program (compiled by the host-pass) to accurately determine the device-specific storage requirements for a given collective (compiled for each device-pass), the interfaces for collective classes are now (optionally) template-parameterized by the desired PTX compute capability. This is useful when aliasing collective storage to shared memory that has been allocated dynamically by the host at the kernel call site. - Most CUB programs having typical 1D usage should not require any changes to accomodate these updates. ## New Features - Added "combination" `cub::WarpScan` methods for efficiently computing both inclusive and exclusive prefix scans (and sums). ## Bug Fixes - Fix for bug in `cub::WarpScan` (which affected `cub::BlockScan` and `cub::DeviceScan`) where incorrect results (e.g., NAN) would often be returned when parameterized for floating-point types (fp32, fp64). 
- Workaround for ptxas error when compiling with with -G flag on Linux (for debug instrumentation). - Fixes for certain scan scenarios using custom scan operators where code compiled for SM1x is run on newer GPUs of higher compute-capability: the compiler could not tell which memory space was being used collective operations and was mistakenly using global ops instead of shared ops. # CUB 1.2.3 ## Summary CUB 1.2.3 is a minor release. ## Bug Fixes - Fixed access violation bug in `cub::DeviceReduce::ReduceByKey` for non-primitive value types. - Fixed code-snippet bug in `ArgIndexInputIteratorT` documentation. # CUB 1.2.2 ## Summary CUB 1.2.2 adds a new variant of `cub::BlockReduce` and MSVC project solections for examples. ## New Features - MSVC project solutions for device-wide and block-wide examples - New algorithmic variant of cub::BlockReduce for improved performance when using commutative operators (e.g., numeric addition). ## Bug Fixes - Inclusion of Thrust headers in a certain order prevented CUB device-wide primitives from working properly. # CUB 1.2.0 ## Summary CUB 1.2.0 adds `cub::DeviceReduce::ReduceByKey` and `cub::DeviceReduce::RunLengthEncode` and support for CUDA 6.0. ## New Features - `cub::DeviceReduce::ReduceByKey`. - `cub::DeviceReduce::RunLengthEncode`. ## Other Enhancements - Improved `cub::DeviceScan`, `cub::DeviceSelect`, `cub::DevicePartition` performance. - Documentation and testing: - Added performance-portability plots for many device-wide primitives. - Explain that iterator (in)compatibilities with CUDA 5.0 (and older) and Thrust 1.6 (and older). - Revised the operation of temporary tile status bookkeeping for `cub::DeviceScan` (and similar) to be safe for current code run on future platforms (now uses proper fences). ## Bug Fixes - Fix `cub::DeviceScan` bug where Windows alignment disagreements between host and device regarding user-defined data types would corrupt tile status. - Fix `cub::BlockScan` bug where certain exclusive scans on custom data types for the `BLOCK_SCAN_WARP_SCANS` variant would return incorrect results for the first thread in the block. - Added workaround to make `cub::TexRefInputIteratorT` work with CUDA 6.0. # CUB 1.1.1 ## Summary CUB 1.1.1 introduces texture and cache modifier iterators, descending sorting, `cub::DeviceSelect`, `cub::DevicePartition`, `cub::Shuffle*`, and `cub::MaxSMOccupancy`. Additionally, scan and sort performance for older GPUs has been improved and many bugs have been fixed. ## Breaking Changes - Refactored block-wide I/O (`cub::BlockLoad` and `cub::BlockStore`), removing cache-modifiers from their interfaces. `cub::CacheModifiedInputIterator` and `cub::CacheModifiedOutputIterator` should now be used with `cub::BlockLoad` and `cub::BlockStore` to effect that behavior. ## New Features - `cub::TexObjInputIterator`, `cub::TexRefInputIterator`, `cub::CacheModifiedInputIterator`, and `cub::CacheModifiedOutputIterator` types for loading & storing arbitrary types through the cache hierarchy. They are compatible with Thrust. - Descending sorting for `cub::DeviceRadixSort` and `cub::BlockRadixSort`. - Min, max, arg-min, and arg-max operators for `cub::DeviceReduce`. - `cub::DeviceSelect` (select-unique, select-if, and select-flagged). - `cub::DevicePartition` (partition-if, partition-flagged). - Generic `cub::ShuffleUp`, `cub::ShuffleDown`, and `cub::ShuffleIndex` for warp-wide communication of arbitrary data types (SM3x and up). 
- `cub::MaxSmOccupancy` for accurately determining SM occupancy for any given kernel function pointer.

## Other Enhancements

- Improved `cub::DeviceScan` and `cub::DeviceRadixSort` performance for older GPUs (SM1x to SM3x).
- Renamed device-wide `stream_synchronous` param to `debug_synchronous` to avoid confusion about usage.
- Documentation improvements:
  - Added simple examples of device-wide methods.
  - Improved doxygen documentation and example snippets.
- Improved test coverage to include up to 21,000 kernel variants and 851,000 unit tests (per architecture, per platform).

## Bug Fixes

- Fix misc `cub::DeviceScan`, `BlockScan`, `DeviceReduce`, and `BlockReduce` bugs when operating on non-primitive types for older architectures (SM1x).
- SHFL-based scans and reductions produced incorrect results for multi-word types (size > 4B) on Linux.
- For `cub::WarpScan`-based scans, not all threads in the first warp were entering the prefix callback functor.
- `cub::DeviceRadixSort` had a race condition with key-value pairs for pre-SM35 architectures.
- `cub::DeviceRadixSort` bitfield-extract behavior with long keys on 64-bit Linux was incorrect.
- `cub::BlockDiscontinuity` failed to compile for types other than `int32_t`/`uint32_t`.
- CUDA Dynamic Parallelism (CDP, e.g. device-callable) versions of device-wide methods now report the same temporary storage allocation size requirement as their host-callable counterparts.

# CUB 1.0.2

## Summary

CUB 1.0.2 is a minor release.

## Bug Fixes

- Corrections to code snippet examples for `cub::BlockLoad`, `cub::BlockStore`, and `cub::BlockDiscontinuity`.
- Cleaned up unnecessary/missing header includes. You can now safely include a specific .cuh (instead of `cub.cuh`).
- Bug/compilation fixes for `cub::BlockHistogram`.

# CUB 1.0.1

## Summary

CUB 1.0.1 adds `cub::DeviceRadixSort` and `cub::DeviceScan`. Numerous other performance and correctness fixes are included.

## Breaking Changes

- New collective interface idiom (specialize/construct/invoke).

## New Features

- `cub::DeviceRadixSort`. Implements short-circuiting for homogeneous digit passes.
- `cub::DeviceScan`. Implements single-pass "adaptive-lookback" strategy.

## Other Enhancements

- Significantly improved documentation (with example code snippets).
- More extensive regression test suite for aggressively testing collective variants.
- Allow non-trivially-constructed types (previously unions had prevented aliasing temporary storage of those types).
- Improved support for SM3x SHFL (collective ops now use SHFL for types larger than 32 bits).
- Better code generation for 64-bit addressing within `cub::BlockLoad`/`cub::BlockStore`.
- `cub::DeviceHistogram` now supports histograms of arbitrary bins.
- Updates to accommodate CUDA 5.5 dynamic parallelism.

## Bug Fixes

- Workarounds for SM10 codegen issues in uncommonly-used `cub::WarpScan`/`cub::WarpReduce` specializations.

# CUB 0.9.4

## Summary

CUB 0.9.4 is a minor release.

## Enhancements

- Various documentation updates and corrections.

## Bug Fixes

- Fixed compilation errors for SM1x.
- Fixed compilation errors for some WarpScan entrypoints on SM3x and up.

# CUB 0.9.3

## Summary

CUB 0.9.3 adds histogram algorithms and work management utility descriptors.

## New Features

- `cub::DeviceHistogram256`.
- `cub::BlockHistogram256`.
- `cub::BlockScan` algorithm variant `BLOCK_SCAN_RAKING_MEMOIZE`, which trades more register consumption for less shared memory I/O (see the sketch after this list).
- `cub::GridQueue`, `cub::GridEvenShare`, work management utility descriptors.
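
The `BLOCK_SCAN_RAKING_MEMOIZE` variant above is selected through `cub::BlockScan`'s algorithm template parameter. A minimal sketch follows, written against the present-day `cub::BlockScan` interface rather than the 0.9-era one; the 128-thread block size, the `int` element type, and the kernel name are arbitrary illustration choices:

```cpp
#include <cub/cub.cuh>

// Each 128-thread block computes an inclusive prefix sum over one int per
// thread, using the raking-memoize scan variant.
__global__ void block_prefix_sum(const int *d_in, int *d_out)
{
  using BlockScanT = cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING_MEMOIZE>;
  __shared__ typename BlockScanT::TempStorage temp_storage;

  int idx          = blockIdx.x * 128 + threadIdx.x;
  int thread_value = d_in[idx];

  BlockScanT(temp_storage).InclusiveSum(thread_value, thread_value);

  d_out[idx] = thread_value;
}

// Launch with exactly 128 threads per block to match the template parameter:
//   block_prefix_sum<<<num_blocks, 128>>>(d_in, d_out);
```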
## Other Enhancements - Updates to `cub::BlockRadixRank` to use `cub::BlockScan`, which improves performance on SM3x by using SHFL. - Allow types other than builtin types to be used in `cub::WarpScan::*Sum` methods if they only have `operator+` overloaded. Previously they also required to support assignment from `int(0)`. - Update `cub::BlockReduce`'s `BLOCK_REDUCE_WARP_REDUCTIONS` algorithm to work even when block size is not an even multiple of warp size. - Refactoring of `cub::DeviceAllocator` interface and `cub::CachingDeviceAllocator` implementation. # CUB 0.9.2 ## Summary CUB 0.9.2 adds `cub::WarpReduce`. ## New Features - `cub::WarpReduce`, which uses the SHFL instruction when applicable. `cub::BlockReduce` now uses this `cub::WarpReduce` instead of implementing its own. ## Enhancements - Documentation updates and corrections. ## Bug Fixes - Fixes for 64-bit Linux compilation warnings and errors. # CUB 0.9.1 ## Summary CUB 0.9.1 is a minor release. ## Bug Fixes - Fix for ambiguity in `cub::BlockScan::Reduce` between generic reduction and summation. Summation entrypoints are now called `::Sum()`, similar to the convention in `cub::BlockScan`. - Small edits to documentation and download tracking. # CUB 0.9.0 ## Summary Initial preview release. CUB is the first durable, high-performance library of cooperative block-level, warp-level, and thread-level primitives for CUDA kernel programming. cub-2.0.1/CMakeLists.txt000066400000000000000000000073521434614775400150510ustar00rootroot00000000000000# 3.15 is the minimum. # 3.17 for NVC++. # 3.18.3 for C++17 + CUDA. cmake_minimum_required(VERSION 3.15) # Remove this when we use the new CUDA_ARCHITECTURES properties. if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) cmake_policy(SET CMP0104 OLD) endif() # CXX is only needed for AppendOptionIfAvailable. project(CUB NONE) # Determine whether CUB is the top-level project or included into # another project via add_subdirectory(). if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") set(CUB_TOPLEVEL_PROJECT ON) else() set(CUB_TOPLEVEL_PROJECT OFF) endif() # This must be done before any languages are enabled: if (CUB_TOPLEVEL_PROJECT) include(cmake/CubCompilerHacks.cmake) endif() # This must appear after our Compiler Hacks or else CMake will delete the cache # and reconfigure from scratch. # This must also appear before the installation rules, as it is required by the # GNUInstallDirs CMake module. enable_language(CXX) # Thrust has its own copy of CUB install rules to handle packaging usecases # where we want to install CUB headers but aren't actually building anything. # In these cases the add_subdirectory(dependencies/cub) line in Thrust won't get # called so we can't rely on CUB providing its own rules. if (NOT CUB_IN_THRUST) option(CUB_ENABLE_INSTALL_RULES "Enable installation of CUB" ${CUB_TOPLEVEL_PROJECT}) if (CUB_ENABLE_INSTALL_RULES) include(cmake/CubInstallRules.cmake) endif() endif() # Support adding CUB to a parent project via add_subdirectory. # See examples/cmake/add_subdir/CMakeLists.txt for details. if (NOT CUB_TOPLEVEL_PROJECT AND NOT CUB_IN_THRUST) include(cmake/CubAddSubdir.cmake) return() endif() option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON) option(CUB_ENABLE_TESTING "Build CUB testing suite." ON) option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON) # This is needed for NVCXX QA, which requires a static set of executable names. # Only a single dialect may be enabled when this is off. 
option(CUB_ENABLE_CPP_DIALECT_IN_NAMES "Include C++ dialect information in target/object/etc names." ON ) mark_as_advanced(CUB_ENABLE_CPP_DIALECT_IN_NAMES) # This option is only used when CUB is built stand-alone; otherwise the Thrust # option has the same effect. if (NOT CUB_IN_THRUST) option(CUB_IGNORE_DEPRECATED_API "Suppress warnings about deprecated Thrust/CUB API." OFF ) endif() # Check if we're actually building anything before continuing. If not, no need # to search for deps, etc. This is a common approach for packagers that just # need the install rules. See GH issue NVIDIA/thrust#1211. if (NOT (CUB_ENABLE_HEADER_TESTING OR CUB_ENABLE_TESTING OR CUB_ENABLE_EXAMPLES)) return() endif() include(cmake/AppendOptionIfAvailable.cmake) include(cmake/CubBuildCompilerTargets.cmake) include(cmake/CubBuildTargetList.cmake) include(cmake/CubCudaConfig.cmake) include(cmake/CubUtilities.cmake) if ("" STREQUAL "${CMAKE_BUILD_TYPE}") set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE) set_property( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel ) endif () set(CMAKE_CXX_EXTENSIONS OFF) # Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up alongside # Thrust targets when building as part of Thrust. set(CUB_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib") set(CUB_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin") cub_build_target_list() if (CUB_ENABLE_HEADER_TESTING) include(cmake/CubHeaderTesting.cmake) endif() # Both testing and examples use ctest if (CUB_ENABLE_TESTING OR CUB_ENABLE_EXAMPLES) include(CTest) enable_testing() endif() if (CUB_ENABLE_TESTING) add_subdirectory(test) endif() if (CUB_ENABLE_EXAMPLES) add_subdirectory(examples) endif() cub-2.0.1/CODE_OF_CONDUCT.md000066400000000000000000000073461434614775400151130ustar00rootroot00000000000000 # Code of Conduct ## Overview This document defines the Code of Conduct followed and enforced for NVIDIA C++ Core Compute Libraries. ### Intended Audience * Community * Developers * Project Leads ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: - Using welcoming and inclusive language. - Being respectful of differing viewpoints and experiences. - Gracefully accepting constructive criticism. - Focusing on what is best for the community. - Showing empathy towards other community members. Examples of unacceptable behavior by participants include: - The use of sexualized language or imagery and unwelcome sexual attention or advances. - Trolling, insulting/derogatory comments, and personal or political attacks. - Public or private harassment. - Publishing others’ private information, such as a physical or electronic address, without explicit permission. - Other conduct which could reasonably be considered inappropriate. ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project email address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project’s leadership. ## Attribution This Code of Conduct was taken from the [NVIDIA RAPIDS] project, which was adapted from the [Contributor Covenant version 1.4]. Please see this [FAQ] for answers to common questions about this Code of Conduct. ## Contact Please email [cpp-conduct@nvidia.com] for any Code of Conduct related matters. [cpp-conduct@nvidia.com]: mailto:cpp-conduct@nvidia.com [FAQ]: https://www.contributor-covenant.org/faq [NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/ [Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html cub-2.0.1/CONTRIBUTING.md000066400000000000000000000100731434614775400145340ustar00rootroot00000000000000# Table of Contents 1. [Contributing to CUB](#contributing-to-cub) 1. [CMake Options](#cmake-options) 1. [Development Model](#development-model) # Contributing to CUB CUB uses Github to manage all open-source development, including bug tracking, pull requests, and design discussions. CUB is tightly coupled to the Thrust project, and a compatible version of Thrust is required when working on the development version of CUB. To setup a CUB development branch, it is recommended to recursively clone the Thrust repository and use the CUB submodule at `dependencies/cub` to stage changes. CUB's tests and examples can be built by configuring Thrust with the CMake option `THRUST_INCLUDE_CUB_CMAKE=ON`. This process is described in more detail in Thrust's [CONTRIBUTING.md](https://nvidia.github.io/thrust/contributing.html). The CMake options in the following section may be used to customize CUB's build process. Note that some of these are controlled by Thrust for compatibility and may not have an effect when building CUB through the Thrust build system. This is pointed out in the documentation below where applicable. # CMake Options A CUB build is configured using CMake options. These may be passed to CMake using ``` cmake -D= [Thrust or CUB project source root] ``` or configured interactively with the `ccmake` or `cmake-gui` interfaces. 
The configuration options for CUB are: - `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}` - Standard CMake build option. Default: `RelWithDebInfo` - `CUB_ENABLE_HEADER_TESTING={ON, OFF}` - Whether to test compile public headers. Default is `ON`. - `CUB_ENABLE_TESTING={ON, OFF}` - Whether to build unit tests. Default is `ON`. - `CUB_ENABLE_EXAMPLES={ON, OFF}` - Whether to build examples. Default is `ON`. - `CUB_ENABLE_DIALECT_CPPXX={ON, OFF}` - Setting this has no effect when building CUB as a component of Thrust. See Thrust's dialect options, which CUB will inherit. - Toggle whether a specific C++ dialect will be targeted. - Multiple dialects may be targeted in a single build. - Possible values of `XX` are `{11, 14, 17}`. - By default, only C++14 is enabled. - `CUB_ENABLE_COMPUTE_XX={ON, OFF}` - Setting this has no effect when building CUB as a component of Thrust. See Thrust's architecture options, which CUB will inherit. - Controls the targeted CUDA architecture(s) - Multiple options may be selected when using NVCC as the CUDA compiler. - Valid values of `XX` are: `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}` - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`: - `CUB_ENABLE_COMPUTE_FUTURE={ON, OFF}` - Setting this has no effect when building CUB as a component of Thrust. See Thrust's architecture options, which CUB will inherit. - If enabled, CUDA objects will target the most recent virtual architecture in addition to the real architectures specified by the `CUB_ENABLE_COMPUTE_XX` options. - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`: - `CUB_DISABLE_ARCH_BY_DEFAULT={ON, OFF}` - Setting this has no effect when building CUB as a component of Thrust. See Thrust's architecture options, which CUB will inherit. - When `ON`, all `CUB_ENABLE_COMPUTE_*` options are initially `OFF`. - Default: `OFF` (meaning all architectures are enabled by default) - `CUB_ENABLE_TESTS_WITH_RDC={ON, OFF}` - Whether to enable Relocatable Device Code when building tests. Default is `OFF`. - `CUB_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}` - Whether to enable Relocatable Device Code when building examples. Default is `OFF`. - `CUB_ENABLE_INSTALL_RULES={ON, OFF}` - Setting this has no effect when building CUB as a component of Thrust. See Thrust's `THRUST_INSTALL_CUB_HEADERS` option, which controls this behavior. - If true, installation rules will be generated for CUB. Default is `ON` when building CUB alone, and `OFF` when CUB is a subproject added via CMake's `add_subdirectory`. # Development Model CUB follows the same development model as Thrust, described [here](https://nvidia.github.io/thrust/releases/versioning.html). cub-2.0.1/LICENSE.TXT000066400000000000000000000030521434614775400137650ustar00rootroot00000000000000Copyright (c) 2010-2011, Duane Merrill. All rights reserved. Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cub-2.0.1/README.md000066400000000000000000000242101434614775400135600ustar00rootroot00000000000000

# About CUB

CUB provides state-of-the-art, reusable software components for every layer of the CUDA programming model: - [Device-wide primitives](https://nvlabs.github.io/cub/group___device_module.html) - Sort, prefix scan, reduction, histogram, etc. - Compatible with CUDA dynamic parallelism - [Block-wide "collective" primitives](https://nvlabs.github.io/cub/group___block_module.html) - I/O, sort, prefix scan, reduction, histogram, etc. - Compatible with arbitrary thread block sizes and types - [Warp-wide "collective" primitives](https://nvlabs.github.io/cub/group___warp_module.html) - Warp-wide prefix scan, reduction, etc. - Safe and architecture-specific - [Thread and resource utilities](https://nvlabs.github.io/cub/group___util_io.html) - PTX intrinsics, device reflection, texture-caching iterators, caching memory allocators, etc. ![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.io/cub/cub_overview.png) CUB is included in the NVIDIA HPC SDK and the CUDA Toolkit. We recommend the [CUB Project Website](http://nvlabs.github.io/cub) for further information and examples.
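As a quick illustration of the device-wide layer listed above, the sketch below shows CUB's usual two-phase pattern: first query the required temporary storage size, then run the algorithm. The wrapper function name and the assumption that `d_in`/`d_out` are caller-allocated device buffers of `int` are illustrative, not part of CUB:

```C++
#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Reduce num_items ints from d_in into d_out[0] with cub::DeviceReduce::Sum.
void SumOnDevice(int *d_in, int *d_out, int num_items)
{
    void   *d_temp_storage     = nullptr;
    size_t  temp_storage_bytes = 0;

    // Phase 1: d_temp_storage is NULL, so CUB only reports the required size.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Phase 2: run the reduction (error handling omitted for brevity).
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaFree(d_temp_storage);
}
```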

# A Simple Example

```C++
#include <cub/cub.cuh>

// Block-sorting CUDA kernel
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
    using namespace cub;

    // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads
    // owning 16 integer items each
    typedef BlockRadixSort<int, 128, 16>                     BlockRadixSort;
    typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE>    BlockLoad;
    typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE>  BlockStore;

    // Allocate shared memory
    __shared__ union {
        typename BlockRadixSort::TempStorage  sort;
        typename BlockLoad::TempStorage       load;
        typename BlockStore::TempStorage      store;
    } temp_storage;

    int block_offset = blockIdx.x * (128 * 16);  // OffsetT for this block's segment

    // Obtain a segment of 2048 consecutive keys that are blocked across threads
    int thread_keys[16];
    BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
    __syncthreads();

    // Collectively sort the keys
    BlockRadixSort(temp_storage.sort).Sort(thread_keys);
    __syncthreads();

    // Store the sorted segment
    BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
```

Each thread block uses `cub::BlockRadixSort` to collectively sort its own input
segment. The class is specialized by the data type being sorted, by the number
of threads per block, by the number of keys per thread, and implicitly by the
targeted compilation architecture.

The `cub::BlockLoad` and `cub::BlockStore` classes are similarly specialized.
Furthermore, to provide coalesced accesses to device memory, these primitives
are configured to access memory using a striped access pattern (where
consecutive threads simultaneously access consecutive items) and then transpose
the keys into a [blocked arrangement](index.html#sec4sec3) of elements across
threads. Once specialized, these classes expose opaque `TempStorage` member
types. The thread block uses these storage types to statically allocate the
union of shared memory needed by the thread block. (Alternatively, these
storage types could be aliased to global memory allocations).
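For completeness, a host-side launch of this kernel could look like the following minimal sketch. It is not part of the original example: the helper name, the `cudaDeviceSynchronize()` call, and the assumption that `num_items` is an exact multiple of the 2048-item tile size are illustrative.

```C++
#include <cuda_runtime.h>

// Hypothetical launcher: sorts each 2048-item segment of d_in independently,
// writing the per-segment sorted results to d_out.
void LaunchBlockSort(int *d_in, int *d_out, int num_items)
{
    const int tile_items = 128 * 16;               // items per thread block
    const int num_blocks = num_items / tile_items; // assumes an exact multiple

    BlockSortKernel<<<num_blocks, 128>>>(d_in, d_out);
    cudaDeviceSynchronize();                       // error handling omitted
}
```

Note that each block sorts only its own 2048-item segment; producing a fully sorted array across segments would require a device-wide algorithm such as `cub::DeviceRadixSort`.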

# Supported Compilers

CUB is regularly tested using the specified versions of the following compilers. Unsupported versions may emit deprecation warnings, which can be silenced by defining CUB_IGNORE_DEPRECATED_COMPILER during compilation.

- NVCC 11.0+
- GCC 5+
- Clang 7+
- MSVC 2019+ (19.20/16.0/14.20)
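For example, when compiling directly with NVCC, the suppression macro mentioned above can be passed on the command line. The source file name and include path below are placeholders, not part of the CUB documentation:

```
# Illustrative only: silence compiler-deprecation warnings for an unsupported toolchain.
nvcc -DCUB_IGNORE_DEPRECATED_COMPILER -I<path-to-cub> example.cu -o example
```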

# Releases

CUB is distributed with the NVIDIA HPC SDK and the CUDA Toolkit in addition to GitHub. See the [changelog](CHANGELOG.md) for details about specific releases.

| CUB Release               | Included In                              |
| ------------------------- | ---------------------------------------- |
| 2.0.0                     | TBD                                      |
| 1.17.1                    | TBD                                      |
| 1.17.0                    | TBD                                      |
| 1.16.0                    | TBD                                      |
| 1.15.0                    | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6  |
| 1.14.0                    | NVIDIA HPC SDK 21.9                      |
| 1.13.1                    | CUDA Toolkit 11.5                        |
| 1.13.0                    | NVIDIA HPC SDK 21.7                      |
| 1.12.1                    | CUDA Toolkit 11.4                        |
| 1.12.0                    | NVIDIA HPC SDK 21.3                      |
| 1.11.0                    | CUDA Toolkit 11.3                        |
| 1.10.0                    | NVIDIA HPC SDK 20.9 & CUDA Toolkit 11.2  |
| 1.9.10-1                  | NVIDIA HPC SDK 20.7 & CUDA Toolkit 11.1  |
| 1.9.10                    | NVIDIA HPC SDK 20.5                      |
| 1.9.9                     | CUDA Toolkit 11.0                        |
| 1.9.8-1                   | NVIDIA HPC SDK 20.3                      |
| 1.9.8                     | CUDA Toolkit 11.0 Early Access           |
| 1.9.8                     | CUDA 11.0 Early Access                   |
| 1.8.0                     |                                          |
| 1.7.5                     | Thrust 1.9.2                             |
| 1.7.4                     | Thrust 1.9.1-2                           |
| 1.7.3                     |                                          |
| 1.7.2                     |                                          |
| 1.7.1                     |                                          |
| 1.7.0                     | Thrust 1.9.0-5                           |
| 1.6.4                     |                                          |
| 1.6.3                     |                                          |
| 1.6.2 (previously 1.5.5)  |                                          |
| 1.6.1 (previously 1.5.4)  |                                          |
| 1.6.0 (previously 1.5.3)  |                                          |
| 1.5.2                     |                                          |
| 1.5.1                     |                                          |
| 1.5.0                     |                                          |
| 1.4.1                     |                                          |
| 1.4.0                     |                                          |
| 1.3.2                     |                                          |
| 1.3.1                     |                                          |
| 1.3.0                     |                                          |
| 1.2.3                     |                                          |
| 1.2.2                     |                                          |
| 1.2.0                     |                                          |
| 1.1.1                     |                                          |
| 1.0.2                     |                                          |
| 1.0.1                     |                                          |
| 0.9.4                     |                                          |
| 0.9.2                     |                                          |
| 0.9.1                     |                                          |
| 0.9.0                     |                                          |

# Development Process

CUB and Thrust depend on each other. It is recommended to clone Thrust and build CUB as a component of Thrust.

CUB uses the [CMake build system](https://cmake.org/) to build unit tests, examples, and header tests. To build CUB as a developer, the following recipe should be followed:

```
# Clone Thrust and CUB from Github. CUB is located in Thrust's
# `dependencies/cub` submodule.
git clone --recursive https://github.com/NVIDIA/thrust.git
cd thrust

# Create build directory:
mkdir build
cd build

# Configure -- use one of the following:
cmake -DTHRUST_INCLUDE_CUB_CMAKE=ON ..   # Command line interface.
ccmake -DTHRUST_INCLUDE_CUB_CMAKE=ON ..  # ncurses GUI (Linux only)
cmake-gui  # Graphical UI, set source/build directories and options in the app

# Build:
cmake --build . -j  # invokes make (or ninja, etc)

# Run tests and examples:
ctest
```

By default, the C++14 standard is targeted, but this can be changed in CMake. More information on configuring your CUB build and creating a pull request is found in [CONTRIBUTING.md](CONTRIBUTING.md).
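As one further example, the targeted C++ dialect can be selected at configure time. The exact flags below are illustrative; when built through Thrust, CUB inherits Thrust's dialect settings, and the full set of options is described in [CONTRIBUTING.md](CONTRIBUTING.md):

```
# Illustrative configure line: build the Thrust/CUB tests against C++17
# instead of the default C++14.
cmake -DTHRUST_INCLUDE_CUB_CMAKE=ON -DTHRUST_CPP_DIALECT=17 ..
```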

# Open Source License

CUB is available under the "New BSD" open-source license: ``` Copyright (c) 2010-2011, Duane Merrill. All rights reserved. Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` cub-2.0.1/cmake/000077500000000000000000000000001434614775400133625ustar00rootroot00000000000000cub-2.0.1/cmake/AppendOptionIfAvailable.cmake000066400000000000000000000004121434614775400210410ustar00rootroot00000000000000include_guard(GLOBAL) include(CheckCXXCompilerFlag) macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST) string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR) check_cxx_compiler_flag(${_FLAG} ${_VAR}) if (${${_VAR}}) list(APPEND ${_LIST} ${_FLAG}) endif () endmacro () cub-2.0.1/cmake/CubAddSubdir.cmake000066400000000000000000000002051434614775400166540ustar00rootroot00000000000000find_package(CUB REQUIRED CONFIG NO_DEFAULT_PATH # Only check the explicit path in HINTS: HINTS "${CMAKE_CURRENT_LIST_DIR}/.." ) cub-2.0.1/cmake/CubBuildCompilerTargets.cmake000066400000000000000000000116771434614775400211160ustar00rootroot00000000000000# # This file defines the `cub_build_compiler_targets()` function, which # creates the following interface targets: # # cub.compiler_interface # - Interface target providing compiler-specific options needed to build # Thrust's tests, examples, etc. function(cub_build_compiler_targets) set(cxx_compile_definitions) set(cxx_compile_options) if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") append_option_if_available("/W4" cxx_compile_options) append_option_if_available("/WX" cxx_compile_options) # Suppress overly-pedantic/unavoidable warnings brought in with /W4: # C4324: structure was padded due to alignment specifier append_option_if_available("/wd4324" cxx_compile_options) # C4127: conditional expression is constant # This can be fixed with `if constexpr` when available, but there's no way # to silence these pre-C++17. 
# TODO We should have per-dialect interface targets so we can leave these # warnings enabled on C++17: append_option_if_available("/wd4127" cxx_compile_options) # C4505: unreferenced local function has been removed # The CUDA `host_runtime.h` header emits this for # `__cudaUnregisterBinaryUtil`. append_option_if_available("/wd4505" cxx_compile_options) # C4706: assignment within conditional expression # MSVC doesn't provide an opt-out for this warning when the assignment is # intentional. Clang will warn for these, but suppresses the warning when # double-parentheses are used around the assignment. We'll let Clang catch # unintentional assignments and suppress all such warnings on MSVC. append_option_if_available("/wd4706" cxx_compile_options) # Some tests require /bigobj to fit everything into their object files: append_option_if_available("/bigobj" cxx_compile_options) else() append_option_if_available("-Werror" cxx_compile_options) append_option_if_available("-Wall" cxx_compile_options) append_option_if_available("-Wextra" cxx_compile_options) append_option_if_available("-Winit-self" cxx_compile_options) append_option_if_available("-Woverloaded-virtual" cxx_compile_options) append_option_if_available("-Wcast-qual" cxx_compile_options) append_option_if_available("-Wpointer-arith" cxx_compile_options) append_option_if_available("-Wunused-local-typedef" cxx_compile_options) append_option_if_available("-Wvla" cxx_compile_options) # Disable GNU extensions (flag is clang only) append_option_if_available("-Wgnu" cxx_compile_options) # Calling a variadic macro with zero args is a GNU extension until C++20, # but the THRUST_PP_ARITY macro is used with zero args. Need to see if this # is a real problem worth fixing. append_option_if_available("-Wno-gnu-zero-variadic-macro-arguments" cxx_compile_options) # This complains about functions in CUDA system headers when used with nvcc. append_option_if_available("-Wno-unused-function" cxx_compile_options) endif() if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3) # GCC 7.3 complains about name mangling changes due to `noexcept` # becoming part of the type system; we don't care. append_option_if_available("-Wno-noexcept-type" cxx_compile_options) endif() endif() if ("Intel" STREQUAL "${CMAKE_CXX_COMPILER_ID}") # Disable warning that inlining is inhibited by compiler thresholds. append_option_if_available("-diag-disable=11074" cxx_compile_options) append_option_if_available("-diag-disable=11076" cxx_compile_options) endif() if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") list(APPEND cxx_compile_options -Mnodaz) # TODO: Managed memory is currently not supported on windows with WSL list(APPEND cxx_compile_options -gpu=nomanaged) endif() add_library(cub.compiler_interface INTERFACE) foreach (cxx_option IN LISTS cxx_compile_options) target_compile_options(cub.compiler_interface INTERFACE $<$:${cxx_option}> $<$:${cxx_option}> # Only use -Xcompiler with NVCC, not NVC++. # # CMake can't split genexs, so this can't be formatted better :( # This is: # if (using CUDA and CUDA_COMPILER is NVCC) add -Xcompiler=opt: $<$:-Xcompiler=${cxx_option}> ) endforeach() # Add these for both CUDA and CXX targets: target_compile_definitions(cub.compiler_interface INTERFACE ${cxx_compile_definitions} ) # Promote warnings and display diagnostic numbers for nvcc: target_compile_options(cub.compiler_interface INTERFACE # If using CUDA w/ NVCC... # Display diagnostic numbers. 
$<$:-Xcudafe=--display_error_number> # Promote warnings. $<$:-Xcudafe=--promote_warnings> # Don't complain about deprecated GPU targets. $<$:-Wno-deprecated-gpu-targets> ) endfunction() cub-2.0.1/cmake/CubBuildTargetList.cmake000066400000000000000000000233201434614775400200600ustar00rootroot00000000000000# This file provides utilities for building and working with CUB # configuration targets. # # CUB_TARGETS # - Built by the calling the `cub_build_target_list()` function. # - Each item is the name of a CUB interface target that is configured for a # certain build configuration. Currently only C++ standard dialect is # considered. # # cub_build_target_list() # - Creates the CUB_TARGETS list. # # The following functions can be used to test/set metadata on a CUB target: # # cub_get_target_property( ) # - Checks the ${prop} target property on CUB target ${target_name} # and sets the ${prop_var} variable in the caller's scope. # - is any valid cmake identifier. # - is the name of a CUB target. # - is one of the following: # - DIALECT: The C++ dialect. Valid values: 11, 14, 17, 20. # - PREFIX: A unique prefix that should be used to name all # targets/tests/examples that use this configuration. # # cub_get_target_properties() # - Defines ${target_name}_${prop} in the caller's scope, for `prop` in: # {DIALECT, PREFIX}. See above for details. # # cub_clone_target_properties( ) # - Set the {DIALECT, PREFIX} metadata on ${dst_target} to match # ${src_target}. See above for details. # - This *MUST* be called on any targets that link to another CUB target # to ensure that dialect information is updated correctly, e.g. # `cub_clone_target_properties(${my_cub_test} ${some_cub_target})` # Dialects: set(CUB_CPP_DIALECT_OPTIONS 11 14 17 20 CACHE INTERNAL "C++ dialects supported by CUB." FORCE ) define_property(TARGET PROPERTY _CUB_DIALECT BRIEF_DOCS "A target's C++ dialect: 11, 14, or 17." FULL_DOCS "A target's C++ dialect: 11, 14, or 17." ) define_property(TARGET PROPERTY _CUB_PREFIX BRIEF_DOCS "A prefix describing the config, eg. 'cub.cpp14'." FULL_DOCS "A prefix describing the config, eg. 'cub.cpp14'." ) function(cub_set_target_properties target_name dialect prefix) set_target_properties(${target_name} PROPERTIES _CUB_DIALECT ${dialect} _CUB_PREFIX ${prefix} ) get_target_property(type ${target_name} TYPE) if (NOT ${type} STREQUAL "INTERFACE_LIBRARY") set_target_properties(${target_name} PROPERTIES CXX_STANDARD ${dialect} CUDA_STANDARD ${dialect} ARCHIVE_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}" LIBRARY_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}" RUNTIME_OUTPUT_DIRECTORY "${CUB_EXECUTABLE_OUTPUT_DIR}" ) # CMake still emits errors about empty CUDA_ARCHITECTURES when CMP0104 # is set to OLD. This suppresses the errors for good. 
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) set_target_properties(${target_name} PROPERTIES CUDA_ARCHITECTURES OFF ) endif() if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") set_target_properties(${target_name} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF ) endif() endif() endfunction() # Get a cub property from a target and store it in var_name # cub_get_target_property( [DIALECT|PREFIX] macro(cub_get_target_property prop_var target_name prop) get_property(${prop_var} TARGET ${target_name} PROPERTY _CUB_${prop}) endmacro() # Defines the following string variables in the caller's scope: # - ${target_name}_DIALECT # - ${target_name}_PREFIX macro(cub_get_target_properties target_name) cub_get_target_property(${target_name}_DIALECT ${target_name} DIALECT) cub_get_target_property(${target_name}_PREFIX ${target_name} PREFIX) endmacro() # Set one target's _CUB_* properties to match another target function(cub_clone_target_properties dst_target src_target) cub_get_target_properties(${src_target}) cub_set_target_properties(${dst_target} ${${src_target}_DIALECT} ${${src_target}_PREFIX} ) endfunction() # Set ${var_name} to TRUE or FALSE in the caller's scope function(_cub_is_config_valid var_name dialect) if (CUB_ENABLE_DIALECT_CPP${dialect}) set(${var_name} TRUE PARENT_SCOPE) else() set(${var_name} FALSE PARENT_SCOPE) endif() endfunction() function(_cub_init_target_list) set(CUB_TARGETS "" CACHE INTERNAL "" FORCE) endfunction() function(_cub_add_target_to_target_list target_name dialect prefix) cub_set_target_properties(${target_name} ${dialect} ${prefix}) target_link_libraries(${target_name} INTERFACE CUB::CUB cub.compiler_interface ) if (TARGET cub.thrust) target_link_libraries(${target_name} INTERFACE cub.thrust) endif() set(CUB_TARGETS ${CUB_TARGETS} ${target_name} CACHE INTERNAL "" FORCE) set(label "cpp${dialect}") string(TOLOWER "${label}" label) message(STATUS "Enabling CUB configuration: ${label}") endfunction() # Build a ${CUB_TARGETS} list containing target names for all # requested configurations function(cub_build_target_list) # Clear the list of targets: _cub_init_target_list() # Handle dialect options: set(num_dialects_enabled 0) foreach (dialect IN LISTS CUB_CPP_DIALECT_OPTIONS) if (CUB_IN_THRUST) # Just use Thrust's settings: if (THRUST_ENABLE_MULTICONFIG) set(CUB_ENABLE_DIALECT_CPP${dialect} ${THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect}} ) else() set(val OFF) if (dialect EQUAL ${THRUST_CPP_DIALECT}) set(val ON) endif() set(CUB_ENABLE_DIALECT_CPP${dialect} ${val}) endif() else() # Create CMake options: set(default_value OFF) if (dialect EQUAL 14) # Default to just 14 on: set(default_value ON) endif() option(CUB_ENABLE_DIALECT_CPP${dialect} "Generate C++${dialect} build configurations." ${default_value} ) endif() if (CUB_ENABLE_DIALECT_CPP${dialect}) math(EXPR num_dialects_enabled "${num_dialects_enabled} + 1") endif() endforeach() # Ensure that only one C++ dialect is enabled when dialect info is hidden: if ((NOT CUB_ENABLE_CPP_DIALECT_IN_NAMES) AND (NOT num_dialects_enabled EQUAL 1)) message(FATAL_ERROR "Only one CUB_ENABLE_DIALECT_CPP## option allowed when " "CUB_ENABLE_CPP_DIALECT_IN_NAMES is OFF." ) endif() # CMake fixed C++17 support for NVCC + MSVC targets in 3.18.3: if (CUB_ENABLE_DIALECT_CPP17) cmake_minimum_required(VERSION 3.18.3) endif() # Supported versions of MSVC do not distinguish between C++11 and C++14. # Warn the user that they may be generating a ton of redundant targets. 
if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CUB_ENABLE_DIALECT_CPP11) message(WARNING "Supported versions of MSVC (2017+) do not distinguish between C++11 " "and C++14. The requested C++11 targets will be built with C++14." ) endif() # Generic config flags: macro(add_flag_option flag docstring default) set(cub_opt "CUB_${flag}") if (CUB_IN_THRUST) set(thrust_opt "THRUST_${flag}") # Use thrust's settings: set(${cub_opt} ${${thrust_opt}}) else() option(${cub_opt} "${docstring}" "${default}") mark_as_advanced(${cub_opt}) endif() endmacro() add_flag_option(IGNORE_DEPRECATED_CPP_DIALECT "Don't warn about any deprecated C++ standards and compilers." OFF) add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." OFF) add_flag_option(IGNORE_DEPRECATED_COMPILER "Don't warn about deprecated compilers." OFF) # Build cub.compiler_interface with warning flags, etc # This must be called before _cub_add_target_to_target_list. cub_build_compiler_targets() # Set up the CUB target while testing out our find_package scripts. find_package(CUB REQUIRED CONFIG NO_DEFAULT_PATH # Only check the explicit path in HINTS: HINTS "${CUB_SOURCE_DIR}" ) # TODO # Some of the iterators and unittests depend on thrust. We should break the # cyclical dependency by migrating CUB's Thrust bits into Thrust. find_package(Thrust ${CUB_VERSION} EXACT CONFIG HINTS "../../" # Check if we are in thrust/dependencies/cub ) if (Thrust_FOUND) thrust_set_CUB_target(CUB::CUB) thrust_create_target(cub.thrust HOST CPP DEVICE CUDA) else() message(STATUS "Thrust was not found. Set CMake variable 'Thrust_DIR' to the " "thrust-config.cmake file of a Thrust ${CUB_VERSION} installation to " "enable additional testing." ) endif() # Build CUB_TARGETS foreach(dialect IN LISTS CUB_CPP_DIALECT_OPTIONS) _cub_is_config_valid(config_valid ${dialect}) if (config_valid) if (NOT CUB_ENABLE_CPP_DIALECT_IN_NAMES) set(prefix "cub") else() set(prefix "cub.cpp${dialect}") endif() set(target_name "${prefix}") add_library(${target_name} INTERFACE) # Set configuration metadata for this cub interface target: _cub_add_target_to_target_list(${target_name} ${dialect} ${prefix}) endif() endforeach() # dialects list(LENGTH CUB_TARGETS count) message(STATUS "${count} unique cub.dialect configurations generated") # Top level meta-target. Makes it easier to just build CUB targets when # building both CUB and Thrust. Add all project files here so IDEs will be # aware of them. This will not generate build rules. file(GLOB_RECURSE all_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CUB_SOURCE_DIR}/cub/*.cuh" ) # Add a cub.all target that builds all configs. if (NOT CUB_ENABLE_CPP_DIALECT_IN_NAMES) add_custom_target(cub.all) else() add_custom_target(cub.all SOURCES ${all_sources}) # Create meta targets for each config: foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) add_custom_target(${config_prefix}.all) add_dependencies(cub.all ${config_prefix}.all) endforeach() endif() endfunction() cub-2.0.1/cmake/CubCompilerHacks.cmake000066400000000000000000000075171434614775400175540ustar00rootroot00000000000000# Set up compiler paths and apply temporary hacks to support NVC++. # This file must be included before enabling any languages. # Temporary hacks to make NVC++ work; this requires you to define # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`. 
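# As an illustrative (not prescriptive) example, a stand-alone configure line
# that satisfies the two requirements above might look like:
#
#   cmake -DCMAKE_CUDA_COMPILER=nvc++ \
#         -DCMAKE_CUDA_COMPILER_ID=NVCXX \
#         -DCMAKE_CUDA_COMPILER_FORCED=ON \
#         <path-to-cub>
#
# The compiler path and source path are placeholders; only the two cache
# variables named above are assumed by this file.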
if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") # If using NVC++, don't set CXX compiler if (NOT "${CMAKE_CXX_COMPILER}" STREQUAL "") unset(CMAKE_CXX_COMPILER CACHE) message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have" " specified a different ISO C++ compiler; NVC++ acts as both, so please" " unset the CMAKE_CXX_COMPILER variable." ) endif() # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't # understand. if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "") unset(CMAKE_CUDA_HOST_COMPILER CACHE) message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have" " specified a different host ISO C++ compiler; NVC++ acts as both, so" " please unset the CMAKE_CUDA_HOST_COMPILER variable." ) endif() set(CMAKE_CXX_COMPILER "${CMAKE_CUDA_COMPILER}") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -cuda") set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CUDA_COMPILER}") set(CMAKE_CUDA_LINK_EXECUTABLE " -o ") endif () # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't # understand. if ((NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")) if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}")) set(tmp "${CMAKE_CUDA_HOST_COMPILER}") unset(CMAKE_CUDA_HOST_COMPILER CACHE) message(FATAL_ERROR "For convenience, CUB's test harness uses CMAKE_CXX_COMPILER for the " "CUDA host compiler. Refusing to overwrite specified " "CMAKE_CUDA_HOST_COMPILER -- please reconfigure without setting this " "variable. Currently:\n" "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}\n" "CMAKE_CUDA_HOST_COMPILER=${tmp}" ) endif () set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") endif () # Temporary hacks to make NVC++ work; this requires you to define # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`. if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") # Need 3.17 for the properties used below. 
cmake_minimum_required(VERSION 3.17) set(CMAKE_CUDA_STANDARD_DEFAULT 03) set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "-std=c++03") set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "-std=c++03") set(CMAKE_CUDA03_STANDARD__HAS_FULL_SUPPORT TRUE) set_property(GLOBAL PROPERTY CMAKE_CUDA03_KNOWN_FEATURES) set(CMAKE_CUDA11_STANDARD_COMPILE_OPTION "-std=c++11") set(CMAKE_CUDA11_EXTENSION_COMPILE_OPTION "-std=c++11") set(CMAKE_CUDA11_STANDARD__HAS_FULL_SUPPORT TRUE) set_property(GLOBAL PROPERTY CMAKE_CUDA11_KNOWN_FEATURES) set(CMAKE_CUDA14_STANDARD_COMPILE_OPTION "-std=c++14") set(CMAKE_CUDA14_EXTENSION_COMPILE_OPTION "-std=c++14") set(CMAKE_CUDA14_STANDARD__HAS_FULL_SUPPORT TRUE) set_property(GLOBAL PROPERTY CMAKE_CUDA14_KNOWN_FEATURES) set(CMAKE_CUDA17_STANDARD_COMPILE_OPTION "-std=c++17") set(CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "-std=c++17") set(CMAKE_CUDA17_STANDARD__HAS_FULL_SUPPORT TRUE) set_property(GLOBAL PROPERTY CMAKE_CUDA17_KNOWN_FEATURES) include(Internal/FeatureTesting) include(Compiler/CMakeCommonCompilerMacros) cmake_record_cuda_compile_features() set(CMAKE_CUDA_COMPILE_FEATURES ${CMAKE_CUDA03_COMPILE_FEATURES} ${CMAKE_CUDA11_COMPILE_FEATURES} ${CMAKE_CUDA14_COMPILE_FEATURES} ${CMAKE_CUDA17_COMPILE_FEATURES} ${CMAKE_CUDA20_COMPILE_FEATURES} ) endif () cub-2.0.1/cmake/CubCudaConfig.cmake000066400000000000000000000134041434614775400170220ustar00rootroot00000000000000enable_language(CUDA) # # Architecture options: # set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80 86 90) set(arch_message "CUB: Explicitly enabled compute architectures:") # Thrust sets up the architecture flags in CMAKE_CUDA_FLAGS already. Just # reuse them if possible. After we transition to CMake 3.18 CUDA_ARCHITECTURE # target properties this will need to be updated. if (CUB_IN_THRUST) # Configure to use all flags from thrust. See ThrustCudaConfig.cmake for # details. set(CUB_CUDA_FLAGS_BASE "${THRUST_CUDA_FLAGS_BASE}") set(CUB_CUDA_FLAGS_RDC "${THRUST_CUDA_FLAGS_RDC}") set(CUB_CUDA_FLAGS_NO_RDC "${THRUST_CUDA_FLAGS_NO_RDC}") # Update the enabled architectures list from thrust foreach (arch IN LISTS all_archs) if (THRUST_ENABLE_COMPUTE_${arch}) set(CUB_ENABLE_COMPUTE_${arch} True) string(APPEND arch_message " sm_${arch}") else() set(CUB_ENABLE_COMPUTE_${arch} False) endif() endforeach() # Otherwise create cache options and build the flags ourselves: else() # NOT CUB_IN_THRUST # Split CUDA_FLAGS into 3 parts: # # CUB_CUDA_FLAGS_BASE: Common CUDA flags for all targets. # CUB_CUDA_FLAGS_RDC: Additional CUDA flags for targets compiled with RDC. # CUB_CUDA_FLAGS_NO_RDC: Additional CUDA flags for targets compiled without RDC. # # This is necessary because CUDA SMs 5.3, 6.2, and 7.2 do not support RDC, but # we want to always build some targets (e.g. testing/cuda/*) with RDC. # We work around this by building the "always RDC" targets without support for # those SMs. This requires two sets of CUDA_FLAGS. # # Enabling any of those SMs along with the ENABLE_RDC options will result in a # configuration error. # # Because of how CMake handles the CMAKE_CUDA_FLAGS variables, every target # generated in a given directory will use the same value for CMAKE_CUDA_FLAGS, # which is determined at the end of the directory's scope. This means caution # should be used when trying to build different targets with different flags, # since they might not behave as expected. 
This will improve with CMake 3.18, # which add the DEVICE_LINK genex, fixing the issue with using per-target # CUDA_FLAGS: https://gitlab.kitware.com/cmake/cmake/-/issues/18265 set(CUB_CUDA_FLAGS_BASE "${CMAKE_CUDA_FLAGS}") set(CUB_CUDA_FLAGS_RDC) set(CUB_CUDA_FLAGS_NO_RDC) # Archs that don't support RDC: set(no_rdc_archs 53 62 72) # Find the highest arch: list(SORT all_archs) list(LENGTH all_archs max_idx) math(EXPR max_idx "${max_idx} - 1") list(GET all_archs ${max_idx} highest_arch) set(option_init OFF) if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") set(option_init ON) endif() option(CUB_DISABLE_ARCH_BY_DEFAULT "If ON, then all compute architectures are disabled on the initial CMake run." ${option_init} ) set(option_init ON) if (CUB_DISABLE_ARCH_BY_DEFAULT) set(option_init OFF) endif() set(arch_flags) set(num_archs_enabled 0) foreach (arch IN LISTS all_archs) option(CUB_ENABLE_COMPUTE_${arch} "Enable code generation for sm_${arch}." ${option_init} ) if (CUB_ENABLE_COMPUTE_${arch}) math(EXPR num_archs_enabled "${num_archs_enabled} + 1") if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") if (NOT ${num_archs_enabled} EQUAL 1) message(FATAL_ERROR "NVC++ does not support compilation for multiple device architectures " "at once." ) endif() set(arch_flag "-gpu=cc${arch}") else() set(arch_flag "-gencode arch=compute_${arch},code=sm_${arch}") endif() string(APPEND arch_message " sm_${arch}") string(APPEND CUB_CUDA_FLAGS_NO_RDC " ${arch_flag}") if (NOT arch IN_LIST no_rdc_archs) string(APPEND CUB_CUDA_FLAGS_RDC " ${arch_flag}") endif() endif() endforeach() if (NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") option(CUB_ENABLE_COMPUTE_FUTURE "Enable code generation for tests for compute_${highest_arch}" ${option_init} ) if (CUB_ENABLE_COMPUTE_FUTURE) string(APPEND THRUST_CUDA_FLAGS_BASE " -gencode arch=compute_${highest_arch},code=compute_${highest_arch}" ) string(APPEND arch_message " compute_${highest_arch}") endif() endif() # TODO Once CMake 3.18 is required, use the CUDA_ARCHITECTURE target props string(APPEND CMAKE_CUDA_FLAGS "${arch_flags}") endif() message(STATUS ${arch_message}) # # RDC options: # # RDC is off by default in NVCC and on by default in NVC++. Turning off RDC # isn't currently supported by NVC++. So, we default to RDC off for NVCC and # RDC on for NVC++. set(option_init OFF) if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") set(option_init ON) endif() option(CUB_ENABLE_TESTS_WITH_RDC "Build all CUB tests with RDC; tests that require RDC are not affected by this option." ${option_init} ) option(CUB_ENABLE_EXAMPLES_WITH_RDC "Build all CUB examples with RDC; examples which require RDC are not affected by this option." 
${option_init} ) # Check for RDC/SM compatibility and error/warn if necessary set(rdc_supported True) foreach (arch IN LISTS no_rdc_archs) if (CUB_ENABLE_COMPUTE_${arch}) set(rdc_supported False) break() endif() endforeach() set(rdc_opts CUB_ENABLE_TESTS_WITH_RDC CUB_ENABLE_EXAMPLES_WITH_RDC ) set(rdc_requested False) foreach (rdc_opt IN LISTS rdc_opts) if (${rdc_opt}) set(rdc_requested True) break() endif() endforeach() if (rdc_requested AND NOT rdc_supported) string(JOIN ", " no_rdc ${no_rdc_archs}) string(JOIN "\n" opts ${rdc_opts}) message(FATAL_ERROR "Architectures {${no_rdc}} do not support RDC and are incompatible with " "these options:\n${opts}" ) endif() # By default RDC is not used: set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}") cub-2.0.1/cmake/CubHeaderTesting.cmake000066400000000000000000000033371434614775400175520ustar00rootroot00000000000000# For every public header, build a translation unit containing `#include
` # to let the compiler try to figure out warnings in that header if it is not otherwise # included in tests, and also to verify if the headers are modular enough. # .inl files are not globbed for, because they are not supposed to be used as public # entrypoints. # Meta target for all configs' header builds: add_custom_target(cub.all.headers) file(GLOB_RECURSE headers RELATIVE "${CUB_SOURCE_DIR}/cub" CONFIGURE_DEPENDS cub/*.cuh ) set(headertest_srcs) foreach (header IN LISTS headers) set(headertest_src "headers/${header}.cu") configure_file("${CUB_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}") list(APPEND headertest_srcs "${headertest_src}") endforeach() function(cub_add_header_test label definitions) foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) set(headertest_target ${config_prefix}.headers.${label}) add_library(${headertest_target} OBJECT ${headertest_srcs}) target_link_libraries(${headertest_target} PUBLIC ${cub_target}) target_compile_definitions(${headertest_target} PRIVATE ${definitions}) cub_clone_target_properties(${headertest_target} ${cub_target}) add_dependencies(cub.all.headers ${headertest_target}) add_dependencies(${config_prefix}.all ${headertest_target}) endforeach() endfunction() # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros: set(header_definitions "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" "CUB_WRAPPED_NAMESPACE=wrapped_cub") cub_add_header_test(base "${header_definitions}") list(APPEND header_definitions "CUB_DISABLE_BF16_SUPPORT") cub_add_header_test(bf16 "${header_definitions}") cub-2.0.1/cmake/CubInstallRules.cmake000066400000000000000000000017521434614775400174440ustar00rootroot00000000000000# Thrust manages its own copy of these rules. Update ThrustInstallRules.cmake # if modifying this file. if (CUB_IN_THRUST) return() endif() # Bring in CMAKE_INSTALL_LIBDIR include(GNUInstallDirs) # CUB is a header library; no need to build anything before installing: set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE) install(DIRECTORY "${CUB_SOURCE_DIR}/cub" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" FILES_MATCHING PATTERN "*.cuh" ) install(DIRECTORY "${CUB_SOURCE_DIR}/cub/cmake/" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub" PATTERN cub-header-search EXCLUDE ) # Need to configure a file to store the infix specified in # CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/cub") configure_file("${CUB_SOURCE_DIR}/cub/cmake/cub-header-search.cmake.in" "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake" @ONLY) install(FILES "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake" DESTINATION "${install_location}") cub-2.0.1/cmake/CubUtilities.cmake000066400000000000000000000005761434614775400170010ustar00rootroot00000000000000# Enable RDC for a CUDA target. Encapsulates compiler hacks: function(cub_enable_rdc_for_cuda_target target_name) if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") set_target_properties(${target_name} PROPERTIES COMPILE_FLAGS "-gpu=rdc" ) else() set_target_properties(${target_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON ) endif() endfunction() cub-2.0.1/cmake/header_test.in000066400000000000000000000044541434614775400162100ustar00rootroot00000000000000// This source file checks that: // 1) Header compiles without error. // 2) Common macro collisions with platform/system headers are avoided. // Define CUB_MACRO_CHECK(macro, header), which emits a diagnostic indicating // a potential macro collision and halts. 
// // Use raw platform checks instead of the CUB_HOST_COMPILER macros since we // don't want to #include any headers other than the one being tested. // // This is only implemented for MSVC/GCC/Clang. #if defined(_MSC_VER) // MSVC // Fake up an error for MSVC #define CUB_MACRO_CHECK_IMPL(msg) \ /* Print message that looks like an error: */ \ __pragma(message(__FILE__ ":" CUB_MACRO_CHECK_IMPL0(__LINE__) \ ": error: " #msg)) \ /* abort compilation due to static_assert or syntax error: */ \ static_assert(false, #msg); #define CUB_MACRO_CHECK_IMPL0(x) CUB_MACRO_CHECK_IMPL1(x) #define CUB_MACRO_CHECK_IMPL1(x) #x #elif defined(__clang__) || defined(__GNUC__) // GCC/clang are easy: #define CUB_MACRO_CHECK_IMPL(msg) CUB_MACRO_CHECK_IMPL0(GCC error #msg) #define CUB_MACRO_CHECK_IMPL0(expr) _Pragma(#expr) #endif // Hacky way to build a string, but it works on all tested platforms. #define CUB_MACRO_CHECK(MACRO, HEADER) \ CUB_MACRO_CHECK_IMPL(Identifier MACRO should not be used from CUB \ headers due to conflicts with HEADER macros.) // complex.h conflicts #define I CUB_MACRO_CHECK('I', complex.h) // windows.h conflicts #define small CUB_MACRO_CHECK('small', windows.h) // We can't enable these checks without breaking some builds -- some standard // library implementations unconditionally `#undef` these macros, which then // causes random failures later. // Leaving these commented out as a warning: Here be dragons. //#define min(...) CUB_MACRO_CHECK('min', windows.h) //#define max(...) CUB_MACRO_CHECK('max', windows.h) // termios.h conflicts (NVIDIA/thrust#1547) #define B0 CUB_MACRO_CHECK("B0", termios.h) #include #if defined(CUB_DISABLE_BF16_SUPPORT) #if defined(__CUDA_BF16_TYPES_EXIST__) #error CUB should not include cuda_bf16.h when BF16 support is disabled #endif #endif cub-2.0.1/cub/000077500000000000000000000000001434614775400130535ustar00rootroot00000000000000cub-2.0.1/cub/agent/000077500000000000000000000000001434614775400141515ustar00rootroot00000000000000cub-2.0.1/cub/agent/agent_adjacent_difference.cuh000066400000000000000000000227751434614775400217700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include "../config.cuh" #include "../util_type.cuh" #include "../util_namespace.cuh" #include "../block/block_load.cuh" #include "../block/block_store.cuh" #include "../block/block_adjacent_difference.cuh" #include CUB_NAMESPACE_BEGIN template < int _BLOCK_THREADS, int _ITEMS_PER_THREAD = 1, cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT> struct AgentAdjacentDifferencePolicy { static constexpr int BLOCK_THREADS = _BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static constexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; template struct AgentDifference { using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using BlockLoad = typename cub::BlockLoadType::type; using BlockStore = typename cub::BlockStoreType::type; using BlockAdjacentDifferenceT = cub::BlockAdjacentDifference; union _TempStorage { typename BlockLoad::TempStorage load; typename BlockStore::TempStorage store; typename BlockAdjacentDifferenceT::TempStorage adjacent_difference; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE; static constexpr int SHARED_MEMORY_SIZE = static_cast(sizeof(TempStorage)); _TempStorage &temp_storage; InputIteratorT input_it; LoadIt load_it; InputT *first_tile_previous; OutputIteratorT result; DifferenceOpT difference_op; OffsetT num_items; __device__ __forceinline__ AgentDifference(TempStorage &temp_storage, InputIteratorT input_it, InputT *first_tile_previous, OutputIteratorT result, DifferenceOpT difference_op, OffsetT num_items) : temp_storage(temp_storage.Alias()) , input_it(input_it) , load_it( THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it)) , first_tile_previous(first_tile_previous) , result(result) , difference_op(difference_op) , num_items(num_items) {} template __device__ __forceinline__ void consume_tile_impl(int num_remaining, int tile_idx, OffsetT tile_base) { InputT input[ITEMS_PER_THREAD]; OutputT output[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoad(temp_storage.load) .Load(load_it + tile_base, input, num_remaining, *(load_it + tile_base)); } else { BlockLoad(temp_storage.load).Load(load_it + tile_base, input); } CTA_SYNC(); if (ReadLeft) { if (IS_FIRST_TILE) { if (IS_LAST_TILE) { 
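// First tile is also the last (partial) tile: there is no preceding tile
// to pull a predecessor from, and only num_remaining of the loaded items
// are valid, so use the partial-tile overload without a tile predecessor.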
BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractLeftPartialTile(input, output, difference_op, num_remaining); } else { BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractLeft(input, output, difference_op); } } else { InputT tile_prev_input = MayAlias ? first_tile_previous[tile_idx] : *(input_it + tile_base - 1); if (IS_LAST_TILE) { BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractLeftPartialTile(input, output, difference_op, num_remaining, tile_prev_input); } else { BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractLeft(input, output, difference_op, tile_prev_input); } } } else { if (IS_LAST_TILE) { BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractRightPartialTile(input, output, difference_op, num_remaining); } else { InputT tile_next_input = MayAlias ? first_tile_previous[tile_idx] : *(input_it + tile_base + ITEMS_PER_TILE); BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractRight(input, output, difference_op, tile_next_input); } } CTA_SYNC(); if (IS_LAST_TILE) { BlockStore(temp_storage.store) .Store(result + tile_base, output, num_remaining); } else { BlockStore(temp_storage.store).Store(result + tile_base, output); } } template __device__ __forceinline__ void consume_tile(int num_remaining, int tile_idx, OffsetT tile_base) { if (tile_idx == 0) { consume_tile_impl(num_remaining, tile_idx, tile_base); } else { consume_tile_impl(num_remaining, tile_idx, tile_base); } } __device__ __forceinline__ void Process(int tile_idx, OffsetT tile_base) { OffsetT num_remaining = num_items - tile_base; if (num_remaining > ITEMS_PER_TILE) // not a last tile { consume_tile(num_remaining, tile_idx, tile_base); } else { consume_tile(num_remaining, tile_idx, tile_base); } } }; template struct AgentDifferenceInit { static constexpr int BLOCK_THREADS = 128; static __device__ __forceinline__ void Process(int tile_idx, InputIteratorT first, InputT *result, OffsetT num_tiles, int items_per_tile) { OffsetT tile_base = static_cast(tile_idx) * items_per_tile; if (tile_base > 0 && tile_idx < num_tiles) { if (ReadLeft) { result[tile_idx] = first[tile_base - 1]; } else { result[tile_idx - 1] = first[tile_base]; } } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_histogram.cuh000066400000000000000000001004501434614775400200250ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . */ #pragma once #include #include "../util_type.cuh" #include "../block/block_load.cuh" #include "../config.cuh" #include "../grid/grid_queue.cuh" #include "../iterator/cache_modified_input_iterator.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy ******************************************************************************/ /** * */ enum BlockHistogramMemoryPreference { GMEM, SMEM, BLEND }; /** * Parameterizable tuning policy type for AgentHistogram */ template < int _BLOCK_THREADS, ///< Threads per thread block int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue struct AgentHistogramPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue }; static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . */ template < typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. 
int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed typename SampleIteratorT, ///< Random-access input iterator type for reading samples typename CounterT, ///< Integer type for counting sample occurrences per histogram bin typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel typename OffsetT, ///< Signed integer type for global offsets int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) struct AgentHistogram { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// The sample type of the input iterator using SampleT = cub::detail::value_t; /// The pixel type of SampleT using PixelT = typename CubVector::Type; /// The quad type of SampleT using QuadT = typename CubVector::Type; /// Constants enum { BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM, IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, }; /// Cache load modifier for reading input elements static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; /// Input iterator wrapper type (for applying cache modifier) // Wrap the native input pointer with CacheModifiedInputIterator // or directly use the supplied input iterator type using WrappedSampleIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, SampleIteratorT>; /// Pixel input iterator type (for applying cache modifier) typedef CacheModifiedInputIterator WrappedPixelIteratorT; /// Qaud input iterator type (for applying cache modifier) typedef CacheModifiedInputIterator WrappedQuadIteratorT; /// Parameterized BlockLoad type for samples typedef BlockLoad< SampleT, BLOCK_THREADS, SAMPLES_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadSampleT; /// Parameterized BlockLoad type for pixels typedef BlockLoad< PixelT, BLOCK_THREADS, PIXELS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadPixelT; /// Parameterized BlockLoad type for quads typedef BlockLoad< QuadT, BLOCK_THREADS, QUADS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadQuadT; /// Shared memory type required by this thread block struct _TempStorage { CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) int tile_idx; // Aliasable storage layout union Aliasable { typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads } aliasable; }; /// Temporary storage type (unionable) struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields 
//--------------------------------------------------------------------- /// Reference to temp_storage _TempStorage &temp_storage; /// Sample input iterator (with cache modifier applied, if possible) WrappedSampleIteratorT d_wrapped_samples; /// Native pointer for input samples (possibly NULL if unavailable) SampleT* d_native_samples; /// The number of output bins for each channel int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; /// The number of privatized bins for each channel int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; /// Reference to gmem privatized histograms for each channel CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; /// Reference to final output histograms (gmem) CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; /// The transform operator for determining privatized counter indices from samples, one for each channel PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; /// Whether to prefer privatized smem counters vs privatized global counters bool prefer_smem; //--------------------------------------------------------------------- // Initialize privatized bin counters //--------------------------------------------------------------------- // Initialize privatized bin counters __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) { // Initialize histogram bin counts to zeros #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) { privatized_histograms[CHANNEL][privatized_bin] = 0; } } // Barrier to make sure all threads are done updating counters CTA_SYNC(); } // Initialize privatized bin counters. Specialized for privatized shared-memory counters __device__ __forceinline__ void InitSmemBinCounters() { CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; InitBinCounters(privatized_histograms); } // Initialize privatized bin counters. Specialized for privatized global-memory counters __device__ __forceinline__ void InitGmemBinCounters() { InitBinCounters(d_privatized_histograms); } //--------------------------------------------------------------------- // Update final output histograms //--------------------------------------------------------------------- // Update final output histograms from privatized histograms __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) { // Barrier to make sure all threads are done updating counters CTA_SYNC(); // Apply privatized bin counts to output bin counts #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { int channel_bins = num_privatized_bins[CHANNEL]; for (int privatized_bin = threadIdx.x; privatized_bin < channel_bins; privatized_bin += BLOCK_THREADS) { int output_bin = -1; CounterT count = privatized_histograms[CHANNEL][privatized_bin]; bool is_valid = count > 0; output_decode_op[CHANNEL].template BinSelect((SampleT) privatized_bin, output_bin, is_valid); if (output_bin >= 0) { atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); } } } } // Update final output histograms from privatized histograms. 
Specialized for privatized shared-memory counters __device__ __forceinline__ void StoreSmemOutput() { CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; StoreOutput(privatized_histograms); } // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters __device__ __forceinline__ void StoreGmemOutput() { StoreOutput(d_privatized_histograms); } //--------------------------------------------------------------------- // Tile accumulation //--------------------------------------------------------------------- // Accumulate pixels. Specialized for RLE compression. __device__ __forceinline__ void AccumulatePixels( SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD], CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], Int2Type is_rle_compress) { #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { // Bin pixels int bins[PIXELS_PER_THREAD]; #pragma unroll for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) { bins[PIXEL] = -1; privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); } CounterT accumulator = 1; #pragma unroll for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) { if (bins[PIXEL] != bins[PIXEL + 1]) { if (bins[PIXEL] >= 0) atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); accumulator = 0; } accumulator++; } // Last pixel if (bins[PIXELS_PER_THREAD - 1] >= 0) atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); } } // Accumulate pixels. Specialized for individual accumulation of each pixel. __device__ __forceinline__ void AccumulatePixels( SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD], CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], Int2Type is_rle_compress) { #pragma unroll for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) { #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { int bin = -1; privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); if (bin >= 0) atomicAdd(privatized_histograms[CHANNEL] + bin, 1); } } } /** * Accumulate pixel, specialized for smem privatized histogram */ __device__ __forceinline__ void AccumulateSmemPixels( SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD]) { CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type()); } /** * Accumulate pixel, specialized for gmem privatized histogram */ __device__ __forceinline__ void AccumulateGmemPixels( SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD]) { AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type()); } //--------------------------------------------------------------------- // Tile loading //--------------------------------------------------------------------- // Load full, aligned tile using pixel iterator (multi-channel) template __device__ __forceinline__ void LoadFullAlignedTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) { typedef PixelT 
AliasedPixels[PIXELS_PER_THREAD]; WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); // Load using a wrapped pixel iterator BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( d_wrapped_pixels, reinterpret_cast(samples)); } // Load full, aligned tile using quad iterator (single-channel) __device__ __forceinline__ void LoadFullAlignedTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type<1> num_active_channels) { typedef QuadT AliasedQuads[QUADS_PER_THREAD]; WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); // Load using a wrapped quad iterator BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( d_wrapped_quads, reinterpret_cast(samples)); } // Load full, aligned tile __device__ __forceinline__ void LoadTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type is_full_tile, Int2Type is_aligned) { LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type()); } // Load full, mis-aligned tile using sample iterator __device__ __forceinline__ void LoadTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type is_full_tile, Int2Type is_aligned) { typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; // Load using sample iterator BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( d_wrapped_samples + block_offset, reinterpret_cast(samples)); } // Load partially-full, aligned tile using the pixel iterator __device__ __forceinline__ void LoadTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type is_full_tile, Int2Type is_aligned) { typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); int valid_pixels = valid_samples / NUM_CHANNELS; // Load using a wrapped pixel iterator BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( d_wrapped_pixels, reinterpret_cast(samples), valid_pixels); } // Load partially-full, mis-aligned tile using sample iterator __device__ __forceinline__ void LoadTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type is_full_tile, Int2Type is_aligned) { typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( d_wrapped_samples + block_offset, reinterpret_cast(samples), valid_samples); } //--------------------------------------------------------------------- // Tile processing //--------------------------------------------------------------------- // Consume a tile of data samples template < bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) bool IS_FULL_TILE> // Whether the tile is full __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) { SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; bool is_valid[PIXELS_PER_THREAD]; // Load tile LoadTile( block_offset, valid_samples, samples, Int2Type(), Int2Type()); // Set valid flags #pragma unroll for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); // Accumulate samples if (prefer_smem) AccumulateSmemPixels(samples, is_valid); else AccumulateGmemPixels(samples, is_valid); } // Consume row tiles. 
Specialized for work-stealing from queue template __device__ __forceinline__ void ConsumeTiles( OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< The number of rows in the region of interest OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest int tiles_per_row, ///< Number of image tiles per row GridQueue tile_queue, Int2Type is_work_stealing) { int num_tiles = num_rows * tiles_per_row; int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; OffsetT num_even_share_tiles = gridDim.x * gridDim.y; while (tile_idx < num_tiles) { int row = tile_idx / tiles_per_row; int col = tile_idx - (row * tiles_per_row); OffsetT row_offset = row * row_stride_samples; OffsetT col_offset = (col * TILE_SAMPLES); OffsetT tile_offset = row_offset + col_offset; if (col == tiles_per_row - 1) { // Consume a partially-full tile at the end of the row OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; ConsumeTile(tile_offset, num_remaining); } else { // Consume full tile ConsumeTile(tile_offset, TILE_SAMPLES); } CTA_SYNC(); // Get next tile if (threadIdx.x == 0) temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; CTA_SYNC(); tile_idx = temp_storage.tile_idx; } } // Consume row tiles. Specialized for even-share (striped across thread blocks) template __device__ __forceinline__ void ConsumeTiles( OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< The number of rows in the region of interest OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest int tiles_per_row, ///< Number of image tiles per row GridQueue tile_queue, Int2Type is_work_stealing) { for (int row = blockIdx.y; row < num_rows; row += gridDim.y) { OffsetT row_begin = row * row_stride_samples; OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); while (tile_offset < row_end) { OffsetT num_remaining = row_end - tile_offset; if (num_remaining < TILE_SAMPLES) { // Consume partial tile ConsumeTile(tile_offset, num_remaining); break; } // Consume full tile ConsumeTile(tile_offset, TILE_SAMPLES); tile_offset += gridDim.x * TILE_SAMPLES; } } } //--------------------------------------------------------------------- // Parameter extraction //--------------------------------------------------------------------- // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) template < CacheLoadModifier _MODIFIER, typename _ValueT, typename _OffsetT> __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) { return itr.ptr; } // Return a native pixel pointer (specialized for other types) template __device__ __forceinline__ SampleT* NativePointer(IteratorT itr) { return NULL; } //--------------------------------------------------------------------- // Interface //--------------------------------------------------------------------- /** * Constructor */ __device__ __forceinline__ AgentHistogram( TempStorage &temp_storage, ///< Reference to temp_storage SampleIteratorT d_samples, ///< Input data to reduce int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram CounterT* 
(&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel : temp_storage(temp_storage.Alias()), d_wrapped_samples(d_samples), num_output_bins(num_output_bins), num_privatized_bins(num_privatized_bins), d_output_histograms(d_output_histograms), privatized_decode_op(privatized_decode_op), output_decode_op(output_decode_op), d_native_samples(NativePointer(d_wrapped_samples)), prefer_smem((MEM_PREFERENCE == SMEM) ? true : // prefer smem privatized histograms (MEM_PREFERENCE == GMEM) ? false : // prefer gmem privatized histograms blockIdx.x & 1) // prefer blended privatized histograms { int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; // Initialize the locations of this block's privatized histograms for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); } /** * Consume image */ __device__ __forceinline__ void ConsumeTiles( OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< The number of rows in the region of interest OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest int tiles_per_row, ///< Number of image tiles per row GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks { // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) int quad_mask = AlignBytes::ALIGN_BYTES - 1; int pixel_mask = AlignBytes::ALIGN_BYTES - 1; size_t row_bytes = sizeof(SampleT) * row_stride_samples; bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel // Whether rows are aligned and can be vectorized if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); else ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); } /** * Initialize privatized bin counters. Specialized for privatized shared-memory counters */ __device__ __forceinline__ void InitBinCounters() { if (prefer_smem) InitSmemBinCounters(); else InitGmemBinCounters(); } /** * Store privatized histogram to device-accessible memory. 
Specialized for privatized shared-memory counters */ __device__ __forceinline__ void StoreOutput() { if (prefer_smem) StoreSmemOutput(); else StoreGmemOutput(); } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_merge_sort.cuh000066400000000000000000000627731434614775400202150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include "../config.cuh" #include "../util_type.cuh" #include "../util_namespace.cuh" #include "../block/block_load.cuh" #include "../block/block_store.cuh" #include "../block/block_merge_sort.cuh" #include CUB_NAMESPACE_BEGIN template < int _BLOCK_THREADS, int _ITEMS_PER_THREAD = 1, cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT> struct AgentMergeSortPolicy { static constexpr int BLOCK_THREADS = _BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static constexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; /// \brief This agent is responsible for the initial in-tile sorting. 
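///
/// \par
/// Each thread block loads one tile of keys (and values, for pairs sorting),
/// sorts it cooperatively with cub::BlockMergeSort, and writes the sorted
/// tile to either the iterator or the raw-pointer output, depending on the
/// ping-pong phase. The snippet below is an illustrative sketch of the
/// block-level primitive this agent builds on (it is not this agent's
/// interface); the 128-thread / 4-items-per-thread shape and the CustomLess
/// functor are assumptions made for the example only.
/// \par
/// \code
/// #include <cub/block/block_merge_sort.cuh>
///
/// struct CustomLess
/// {
///   __device__ bool operator()(const int &lhs, const int &rhs) const
///   {
///     return lhs < rhs;
///   }
/// };
///
/// // Sorts one 512-key tile per thread block (assumes blockDim.x == 128 and
/// // that d_keys holds at least gridDim.x * 512 items).
/// __global__ void BlockSortKernel(int *d_keys)
/// {
///   using BlockMergeSortT = cub::BlockMergeSort<int, 128, 4>;
///   __shared__ typename BlockMergeSortT::TempStorage temp_storage;
///
///   // Blocked arrangement: thread t owns keys [4 * t, 4 * t + 3] of its tile
///   int thread_keys[4];
///   int tile_base = blockIdx.x * 512;
///   for (int i = 0; i < 4; ++i)
///   {
///     thread_keys[i] = d_keys[tile_base + threadIdx.x * 4 + i];
///   }
///
///   BlockMergeSortT(temp_storage).Sort(thread_keys, CustomLess());
///
///   for (int i = 0; i < 4; ++i)
///   {
///     d_keys[tile_base + threadIdx.x * 4 + i] = thread_keys[i];
///   }
/// }
/// \endcode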
template struct AgentBlockSort { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- static constexpr bool KEYS_ONLY = std::is_same::value; using BlockMergeSortT = BlockMergeSort; using KeysLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using BlockLoadKeys = typename cub::BlockLoadType::type; using BlockLoadItems = typename cub::BlockLoadType::type; using BlockStoreKeysIt = typename cub::BlockStoreType::type; using BlockStoreItemsIt = typename cub::BlockStoreType::type; using BlockStoreKeysRaw = typename cub::BlockStoreType::type; using BlockStoreItemsRaw = typename cub::BlockStoreType::type; union _TempStorage { typename BlockLoadKeys::TempStorage load_keys; typename BlockLoadItems::TempStorage load_items; typename BlockStoreKeysIt::TempStorage store_keys_it; typename BlockStoreItemsIt::TempStorage store_items_it; typename BlockStoreKeysRaw::TempStorage store_keys_raw; typename BlockStoreItemsRaw::TempStorage store_items_raw; typename BlockMergeSortT::TempStorage block_merge; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE; static constexpr int SHARED_MEMORY_SIZE = static_cast(sizeof(TempStorage)); //--------------------------------------------------------------------- // Per thread data //--------------------------------------------------------------------- bool ping; _TempStorage &storage; KeysLoadIt keys_in; ItemsLoadIt items_in; OffsetT keys_count; KeyIteratorT keys_out_it; ValueIteratorT items_out_it; KeyT *keys_out_raw; ValueT *items_out_raw; CompareOpT compare_op; __device__ __forceinline__ AgentBlockSort(bool ping_, TempStorage &storage_, KeysLoadIt keys_in_, ItemsLoadIt items_in_, OffsetT keys_count_, KeyIteratorT keys_out_it_, ValueIteratorT items_out_it_, KeyT *keys_out_raw_, ValueT *items_out_raw_, CompareOpT compare_op_) : ping(ping_) , storage(storage_.Alias()) , keys_in(keys_in_) , items_in(items_in_) , keys_count(keys_count_) , keys_out_it(keys_out_it_) , items_out_it(items_out_it_) , keys_out_raw(keys_out_raw_) , items_out_raw(items_out_raw_) , compare_op(compare_op_) { } __device__ __forceinline__ void Process() { auto tile_idx = static_cast(blockIdx.x); auto num_tiles = static_cast(gridDim.x); auto tile_base = tile_idx * ITEMS_PER_TILE; int items_in_tile = (cub::min)(keys_count - tile_base, int{ITEMS_PER_TILE}); if (tile_idx < num_tiles - 1) { consume_tile(tile_base, ITEMS_PER_TILE); } else { consume_tile(tile_base, items_in_tile); } } template __device__ __forceinline__ void consume_tile(OffsetT tile_base, int num_remaining) { ValueT items_local[ITEMS_PER_THREAD]; if (!KEYS_ONLY) { if (IS_LAST_TILE) { BlockLoadItems(storage.load_items) .Load(items_in + tile_base, items_local, num_remaining, *(items_in + tile_base)); } else { BlockLoadItems(storage.load_items).Load(items_in + tile_base, items_local); } CTA_SYNC(); } KeyT keys_local[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { BlockLoadKeys(storage.load_keys) .Load(keys_in + tile_base, keys_local, num_remaining, *(keys_in + tile_base)); } else { BlockLoadKeys(storage.load_keys) .Load(keys_in + tile_base, keys_local); } CTA_SYNC(); if (IS_LAST_TILE) { 
BlockMergeSortT(storage.block_merge) .Sort(keys_local, items_local, compare_op, num_remaining, keys_local[0]); } else { BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op); } CTA_SYNC(); if (ping) { if (IS_LAST_TILE) { BlockStoreKeysIt(storage.store_keys_it) .Store(keys_out_it + tile_base, keys_local, num_remaining); } else { BlockStoreKeysIt(storage.store_keys_it) .Store(keys_out_it + tile_base, keys_local); } if (!KEYS_ONLY) { CTA_SYNC(); if (IS_LAST_TILE) { BlockStoreItemsIt(storage.store_items_it) .Store(items_out_it + tile_base, items_local, num_remaining); } else { BlockStoreItemsIt(storage.store_items_it) .Store(items_out_it + tile_base, items_local); } } } else { if (IS_LAST_TILE) { BlockStoreKeysRaw(storage.store_keys_raw) .Store(keys_out_raw + tile_base, keys_local, num_remaining); } else { BlockStoreKeysRaw(storage.store_keys_raw) .Store(keys_out_raw + tile_base, keys_local); } if (!KEYS_ONLY) { CTA_SYNC(); if (IS_LAST_TILE) { BlockStoreItemsRaw(storage.store_items_raw) .Store(items_out_raw + tile_base, items_local, num_remaining); } else { BlockStoreItemsRaw(storage.store_items_raw) .Store(items_out_raw + tile_base, items_local); } } } } }; /** * \brief This agent is responsible for partitioning a merge path into equal segments * * There are two sorted arrays to be merged into one array. If the first array * is partitioned between parallel workers by slicing it into ranges of equal * size, there could be a significant workload imbalance. The imbalance is * caused by the fact that the distribution of elements from the second array * is unknown beforehand. Instead, the MergePath is partitioned between workers. * This approach guarantees an equal amount of work being assigned to each worker. * * This approach is outlined in the paper: * Odeh et al, "Merge Path - Parallel Merging Made Simple" * doi:10.1109/IPDPSW.2012.202 */ template < typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename KeyT> struct AgentPartition { bool ping; KeyIteratorT keys_ping; KeyT *keys_pong; OffsetT keys_count; OffsetT partition_idx; OffsetT *merge_partitions; CompareOpT compare_op; OffsetT target_merged_tiles_number; int items_per_tile; __device__ __forceinline__ AgentPartition(bool ping, KeyIteratorT keys_ping, KeyT *keys_pong, OffsetT keys_count, OffsetT partition_idx, OffsetT *merge_partitions, CompareOpT compare_op, OffsetT target_merged_tiles_number, int items_per_tile) : ping(ping) , keys_ping(keys_ping) , keys_pong(keys_pong) , keys_count(keys_count) , partition_idx(partition_idx) , merge_partitions(merge_partitions) , compare_op(compare_op) , target_merged_tiles_number(target_merged_tiles_number) , items_per_tile(items_per_tile) {} __device__ __forceinline__ void Process() { OffsetT merged_tiles_number = target_merged_tiles_number / 2; // target_merged_tiles_number is a power of two. 
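    // Illustrative example (values chosen only for exposition): with
    // target_merged_tiles_number == 4 and partition_idx == 6, the quantities
    // computed below are
    //   mask           = 3
    //   list           = ~3 & 6 = 4   (the merged group starts at tile 4)
    //   local_tile_idx =  3 & 6 = 2   (third partition within that group)
    // so keys1 spans tiles 4-5, keys2 spans tiles 6-7, and the merge-path
    // search locates the diagonal 2 * items_per_tile items into their merge.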
OffsetT mask = target_merged_tiles_number - 1; // The first tile number in the tiles group being merged, equal to: // target_merged_tiles_number * (partition_idx / target_merged_tiles_number) OffsetT list = ~mask & partition_idx; OffsetT start = items_per_tile * list; OffsetT size = items_per_tile * merged_tiles_number; // Tile number within the tile group being merged, equal to: // partition_idx / target_merged_tiles_number OffsetT local_tile_idx = mask & partition_idx; OffsetT keys1_beg = (cub::min)(keys_count, start); OffsetT keys1_end = (cub::min)(keys_count, start + size); OffsetT keys2_beg = keys1_end; OffsetT keys2_end = (cub::min)(keys_count, keys2_beg + size); OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); OffsetT partition_diag = ping ? MergePath(keys_ping + keys1_beg, keys_ping + keys2_beg, keys1_end - keys1_beg, keys2_end - keys2_beg, partition_at, compare_op) : MergePath(keys_pong + keys1_beg, keys_pong + keys2_beg, keys1_end - keys1_beg, keys2_end - keys2_beg, partition_at, compare_op); merge_partitions[partition_idx] = keys1_beg + partition_diag; } }; /// \brief The agent is responsible for merging N consecutive sorted arrays into N/2 sorted arrays. template < typename Policy, typename KeyIteratorT, typename ValueIteratorT, typename OffsetT, typename CompareOpT, typename KeyT, typename ValueT> struct AgentMerge { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using KeysOutputPongIt = KeyIteratorT; using ItemsOutputPongIt = ValueIteratorT; using KeysOutputPingIt = KeyT*; using ItemsOutputPingIt = ValueT*; using BlockStoreKeysPong = typename BlockStoreType::type; using BlockStoreItemsPong = typename BlockStoreType::type; using BlockStoreKeysPing = typename BlockStoreType::type; using BlockStoreItemsPing = typename BlockStoreType::type; /// Parameterized BlockReduce primitive union _TempStorage { typename BlockStoreKeysPing::TempStorage store_keys_ping; typename BlockStoreItemsPing::TempStorage store_items_ping; typename BlockStoreKeysPong::TempStorage store_keys_pong; typename BlockStoreItemsPong::TempStorage store_items_pong; KeyT keys_shared[Policy::ITEMS_PER_TILE + 1]; ValueT items_shared[Policy::ITEMS_PER_TILE + 1]; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; static constexpr bool KEYS_ONLY = std::is_same::value; static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE; static constexpr int SHARED_MEMORY_SIZE = static_cast(sizeof(TempStorage)); //--------------------------------------------------------------------- // Per thread data //--------------------------------------------------------------------- bool ping; _TempStorage& storage; KeysLoadPingIt keys_in_ping; ItemsLoadPingIt items_in_ping; KeysLoadPongIt keys_in_pong; ItemsLoadPongIt items_in_pong; OffsetT keys_count; KeysOutputPongIt keys_out_pong; ItemsOutputPongIt items_out_pong; KeysOutputPingIt keys_out_ping; 
ItemsOutputPingIt items_out_ping; CompareOpT compare_op; OffsetT *merge_partitions; OffsetT target_merged_tiles_number; //--------------------------------------------------------------------- // Utility functions //--------------------------------------------------------------------- /** * \brief Concatenates up to ITEMS_PER_THREAD elements from input{1,2} into output array * * Reads data in a coalesced fashion [BLOCK_THREADS * item + tid] and * stores the result in output[item]. */ template __device__ __forceinline__ void gmem_to_reg(T (&output)[ITEMS_PER_THREAD], It1 input1, It2 input2, int count1, int count2) { if (IS_FULL_TILE) { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = BLOCK_THREADS * item + threadIdx.x; output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; } } else { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = BLOCK_THREADS * item + threadIdx.x; if (idx < count1 + count2) { output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; } } } } /// \brief Stores data in a coalesced fashion in[item] -> out[BLOCK_THREADS * item + tid] template __device__ __forceinline__ void reg_to_shared(It output, T (&input)[ITEMS_PER_THREAD]) { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = BLOCK_THREADS * item + threadIdx.x; output[idx] = input[item]; } } template __device__ __forceinline__ void consume_tile(int tid, OffsetT tile_idx, OffsetT tile_base, int count) { OffsetT partition_beg = merge_partitions[tile_idx + 0]; OffsetT partition_end = merge_partitions[tile_idx + 1]; // target_merged_tiles_number is a power of two. OffsetT merged_tiles_number = target_merged_tiles_number / 2; OffsetT mask = target_merged_tiles_number - 1; // The first tile number in the tiles group being merged, equal to: // target_merged_tiles_number * (tile_idx / target_merged_tiles_number) OffsetT list = ~mask & tile_idx; OffsetT start = ITEMS_PER_TILE * list; OffsetT size = ITEMS_PER_TILE * merged_tiles_number; OffsetT diag = ITEMS_PER_TILE * tile_idx - start; OffsetT keys1_beg = partition_beg; OffsetT keys1_end = partition_end; OffsetT keys2_beg = (cub::min)(keys_count, 2 * start + size + diag - partition_beg); OffsetT keys2_end = (cub::min)(keys_count, 2 * start + size + diag + ITEMS_PER_TILE - partition_end); // Check if it's the last tile in the tile group being merged if (mask == (mask & tile_idx)) { keys1_end = (cub::min)(keys_count, start + size); keys2_end = (cub::min)(keys_count, start + size * 2); } // number of keys per tile // int num_keys1 = static_cast(keys1_end - keys1_beg); int num_keys2 = static_cast(keys2_end - keys2_beg); // load keys1 & keys2 KeyT keys_local[ITEMS_PER_THREAD]; if (ping) { gmem_to_reg(keys_local, keys_in_ping + keys1_beg, keys_in_ping + keys2_beg, num_keys1, num_keys2); } else { gmem_to_reg(keys_local, keys_in_pong + keys1_beg, keys_in_pong + keys2_beg, num_keys1, num_keys2); } reg_to_shared(&storage.keys_shared[0], keys_local); // preload items into registers already // ValueT items_local[ITEMS_PER_THREAD]; if (!KEYS_ONLY) { if (ping) { gmem_to_reg(items_local, items_in_ping + keys1_beg, items_in_ping + keys2_beg, num_keys1, num_keys2); } else { gmem_to_reg(items_local, items_in_pong + keys1_beg, items_in_pong + keys2_beg, num_keys1, num_keys2); } } CTA_SYNC(); // use binary search in shared memory // to find merge path for each of thread // we can use int type here, because the number of // items in shared memory is limited // int diag0_local = 
(cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); int keys1_beg_local = MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_local, compare_op); int keys1_end_local = num_keys1; int keys2_beg_local = diag0_local - keys1_beg_local; int keys2_end_local = num_keys2; int num_keys1_local = keys1_end_local - keys1_beg_local; int num_keys2_local = keys2_end_local - keys2_beg_local; // perform serial merge // int indices[ITEMS_PER_THREAD]; SerialMerge(&storage.keys_shared[0], keys1_beg_local, keys2_beg_local + num_keys1, num_keys1_local, num_keys2_local, keys_local, indices, compare_op); CTA_SYNC(); // write keys // if (ping) { if (IS_FULL_TILE) { BlockStoreKeysPing(storage.store_keys_ping) .Store(keys_out_ping + tile_base, keys_local); } else { BlockStoreKeysPing(storage.store_keys_ping) .Store(keys_out_ping + tile_base, keys_local, num_keys1 + num_keys2); } } else { if (IS_FULL_TILE) { BlockStoreKeysPong(storage.store_keys_pong) .Store(keys_out_pong + tile_base, keys_local); } else { BlockStoreKeysPong(storage.store_keys_pong) .Store(keys_out_pong + tile_base, keys_local, num_keys1 + num_keys2); } } // if items are provided, merge them if (!KEYS_ONLY) { CTA_SYNC(); reg_to_shared(&storage.items_shared[0], items_local); CTA_SYNC(); // gather items from shared mem // #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { items_local[item] = storage.items_shared[indices[item]]; } CTA_SYNC(); // write from reg to gmem // if (ping) { if (IS_FULL_TILE) { BlockStoreItemsPing(storage.store_items_ping) .Store(items_out_ping + tile_base, items_local); } else { BlockStoreItemsPing(storage.store_items_ping) .Store(items_out_ping + tile_base, items_local, count); } } else { if (IS_FULL_TILE) { BlockStoreItemsPong(storage.store_items_pong) .Store(items_out_pong + tile_base, items_local); } else { BlockStoreItemsPong(storage.store_items_pong) .Store(items_out_pong + tile_base, items_local, count); } } } } __device__ __forceinline__ AgentMerge(bool ping_, TempStorage &storage_, KeysLoadPingIt keys_in_ping_, ItemsLoadPingIt items_in_ping_, KeysLoadPongIt keys_in_pong_, ItemsLoadPongIt items_in_pong_, OffsetT keys_count_, KeysOutputPingIt keys_out_ping_, ItemsOutputPingIt items_out_ping_, KeysOutputPongIt keys_out_pong_, ItemsOutputPongIt items_out_pong_, CompareOpT compare_op_, OffsetT *merge_partitions_, OffsetT target_merged_tiles_number_) : ping(ping_) , storage(storage_.Alias()) , keys_in_ping(keys_in_ping_) , items_in_ping(items_in_ping_) , keys_in_pong(keys_in_pong_) , items_in_pong(items_in_pong_) , keys_count(keys_count_) , keys_out_pong(keys_out_pong_) , items_out_pong(items_out_pong_) , keys_out_ping(keys_out_ping_) , items_out_ping(items_out_ping_) , compare_op(compare_op_) , merge_partitions(merge_partitions_) , target_merged_tiles_number(target_merged_tiles_number_) {} __device__ __forceinline__ void Process() { int tile_idx = static_cast(blockIdx.x); int num_tiles = static_cast(gridDim.x); OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; int tid = static_cast(threadIdx.x); int items_in_tile = static_cast( (cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); if (tile_idx < num_tiles - 1) { consume_tile(tid, tile_idx, tile_base, ITEMS_PER_TILE); } else { consume_tile(tid, tile_idx, tile_base, items_in_tile); } } }; CUB_NAMESPACE_END 
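// The agents above (AgentBlockSort, AgentPartition, AgentMerge) are the
// building blocks behind the device-level merge sort entry points in
// cub::DeviceMergeSort. The sketch below is illustrative only and is not part
// of this header's interface; it assumes d_keys points to num_items
// device-resident ints and that CustomLess is a device-callable comparison
// functor.
//
//   void   *d_temp_storage     = nullptr;
//   size_t  temp_storage_bytes = 0;
//
//   // First call: query temporary storage requirements
//   cub::DeviceMergeSort::SortKeys(d_temp_storage, temp_storage_bytes,
//                                  d_keys, num_items, CustomLess());
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//
//   // Second call: block-sort the tiles, then iteratively partition and merge
//   cub::DeviceMergeSort::SortKeys(d_temp_storage, temp_storage_bytes,
//                                  d_keys, num_items, CustomLess());
//   cudaFree(d_temp_storage);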
cub-2.0.1/cub/agent/agent_radix_sort_downsweep.cuh000066400000000000000000000706771434614775400223220ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread * blocks for participating in device-wide radix sort downsweep . 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentRadixSortDownsweep */ template < int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input) typename ComputeT, ///< Dominant compute type BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use int _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) typename ScalingType = RegBoundScaling > struct AgentRadixSortDownsweepPolicy : ScalingType { enum { RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) }; static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
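 *
 * \par
 * Keys are first mapped to an order-preserving unsigned representation
 * (bit "twiddling"), and the digit for the current pass is then extracted
 * with a bitfield extract. The sketch below is illustrative only (it is not
 * part of this class's interface) and assumes a 32-bit unsigned key with
 * current_bit = 8 and num_bits = 8; see radix_rank_sort_operations.cuh and
 * util_type.cuh for the exact signatures.
 *
 * \code
 * using UnsignedBits = typename cub::Traits<unsigned int>::UnsignedBits;
 *
 * unsigned int key      = 0x00ABCDEFu;
 * UnsignedBits twiddled = cub::Traits<unsigned int>::TwiddleIn(key); // identity for unsigned keys
 *
 * cub::BFEDigitExtractor<unsigned int> extractor(8, 8); // current_bit, num_bits
 * unsigned int digit = extractor.Digit(twiddled);       // bits [8, 16): 0xCD
 * \endcode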
*/ template < typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< KeyT type typename ValueT, ///< ValueT type typename OffsetT> ///< Signed integer type for global offsets struct AgentRadixSortDownsweep { //--------------------------------------------------------------------- // Type definitions and constants //--------------------------------------------------------------------- // Appropriate unsigned-bits representation of KeyT typedef typename Traits::UnsignedBits UnsignedBits; static const UnsignedBits LOWEST_KEY = Traits::LOWEST_KEY; static const UnsignedBits MAX_KEY = Traits::MAX_KEY; static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM; static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER; static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM; static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM; enum { BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS, ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD, RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, RADIX_DIGITS = 1 << RADIX_BITS, KEYS_ONLY = std::is_same::value, LOAD_WARP_STRIPED = RANK_ALGORITHM == RADIX_RANK_MATCH || RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ANY || RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR, }; // Input iterator wrapper type (for applying cache modifier)s using KeysItr = CacheModifiedInputIterator; using ValuesItr = CacheModifiedInputIterator; // Radix ranking type to use using BlockRadixRankT = cub::detail::conditional_t< RANK_ALGORITHM == RADIX_RANK_BASIC, BlockRadixRank, cub::detail::conditional_t< RANK_ALGORITHM == RADIX_RANK_MEMOIZE, BlockRadixRank, cub::detail::conditional_t< RANK_ALGORITHM == RADIX_RANK_MATCH, BlockRadixRankMatch, cub::detail::conditional_t< RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BlockRadixRankMatchEarlyCounts, BlockRadixRankMatchEarlyCounts>>>>; // Digit extractor type using DigitExtractorT = BFEDigitExtractor; enum { /// Number of bin-starting offsets tracked per thread BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD }; // BlockLoad type (keys) using BlockLoadKeysT = BlockLoad; // BlockLoad type (values) using BlockLoadValuesT = BlockLoad; // Value exchange array type typedef ValueT ValueExchangeT[TILE_ITEMS]; /** * Shared memory storage layout */ union __align__(16) _TempStorage { typename BlockLoadKeysT::TempStorage load_keys; typename BlockLoadValuesT::TempStorage load_values; typename BlockRadixRankT::TempStorage radix_rank; struct KeysAndOffsets { UnsignedBits exchange_keys[TILE_ITEMS]; OffsetT relative_bin_offsets[RADIX_DIGITS]; } keys_and_offsets; Uninitialized exchange_values; OffsetT exclusive_digit_prefix[RADIX_DIGITS]; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Thread fields //--------------------------------------------------------------------- // Shared storage for this CTA _TempStorage &temp_storage; // Input and output device pointers KeysItr d_keys_in; ValuesItr d_values_in; UnsignedBits *d_keys_out; ValueT *d_values_out; // The global scatter base offset for each digit (valid in the first 
RADIX_DIGITS threads) OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // Digit extractor DigitExtractorT digit_extractor; // Whether to short-cirucit int short_circuit; //--------------------------------------------------------------------- // Utility methods //--------------------------------------------------------------------- /** * Scatter ranked keys through shared memory, then to device-accessible memory */ template __device__ __forceinline__ void ScatterKeys( UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], OffsetT valid_items) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { temp_storage.keys_and_offsets.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { UnsignedBits key = temp_storage.keys_and_offsets.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; UnsignedBits digit = digit_extractor.Digit(key); relative_bin_offsets[ITEM] = temp_storage.keys_and_offsets.relative_bin_offsets[digit]; // Un-twiddle key = Traits::TwiddleOut(key); if (FULL_TILE || (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) { d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key; } } } /** * Scatter ranked values through shared memory, then to device-accessible memory */ template __device__ __forceinline__ void ScatterValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], OffsetT valid_items) { CTA_SYNC(); ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { exchange_values[ranks[ITEM]] = values[ITEM]; } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; if (FULL_TILE || (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) { d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; } } } /** * Load a tile of keys (specialized for full tile, block load) */ __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, Int2Type warp_striped) { BlockLoadKeysT(temp_storage.load_keys).Load( d_keys_in + block_offset, keys); CTA_SYNC(); } /** * Load a tile of keys (specialized for partial tile, block load) */ __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); BlockLoadKeysT(temp_storage.load_keys).Load( d_keys_in + block_offset, keys, valid_items, oob_item); CTA_SYNC(); } /** * Load a tile of keys (specialized for full tile, warp-striped load) */ __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, Int2Type warp_striped) { LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); } /** * Load a tile of keys (specialized for partial tile, warp-striped load) */ __device__ __forceinline__ void LoadKeys( UnsignedBits 
(&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); } /** * Load a tile of values (specialized for full tile, block load) */ __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, Int2Type warp_striped) { BlockLoadValuesT(temp_storage.load_values).Load( d_values_in + block_offset, values); CTA_SYNC(); } /** * Load a tile of values (specialized for partial tile, block load) */ __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); BlockLoadValuesT(temp_storage.load_values).Load( d_values_in + block_offset, values, valid_items); CTA_SYNC(); } /** * Load a tile of items (specialized for full tile, warp-striped load) */ __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, Int2Type warp_striped) { LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); } /** * Load a tile of items (specialized for partial tile, warp-striped load) */ __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); } /** * Truck along associated values */ template __device__ __forceinline__ void GatherScatterValues( OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type /*is_keys_only*/) { ValueT values[ITEMS_PER_THREAD]; CTA_SYNC(); LoadValues( values, block_offset, valid_items, Int2Type(), Int2Type()); ScatterValues( values, relative_bin_offsets, ranks, valid_items); } /** * Truck along associated values (specialized for key-only sorting) */ template __device__ __forceinline__ void GatherScatterValues( OffsetT (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD], int (&/*ranks*/)[ITEMS_PER_THREAD], OffsetT /*block_offset*/, OffsetT /*valid_items*/, Int2Type /*is_keys_only*/) {} /** * Process tile */ template __device__ __forceinline__ void ProcessTile( OffsetT block_offset, const OffsetT &valid_items = TILE_ITEMS) { UnsignedBits keys[ITEMS_PER_THREAD]; int ranks[ITEMS_PER_THREAD]; OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; // Assign default (min/max) value to all keys UnsignedBits default_key = (IS_DESCENDING) ? 
LOWEST_KEY : MAX_KEY; // Load tile of keys LoadKeys( keys, block_offset, valid_items, default_key, Int2Type(), Int2Type()); // Twiddle key bits if necessary #pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { keys[KEY] = Traits::TwiddleIn(keys[KEY]); } // Rank the twiddled keys int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; BlockRadixRankT(temp_storage.radix_rank).RankKeys( keys, ranks, digit_extractor, exclusive_digit_prefix); CTA_SYNC(); // Share exclusive digit prefix #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { // Store exclusive prefix temp_storage.exclusive_digit_prefix[bin_idx] = exclusive_digit_prefix[track]; } } CTA_SYNC(); // Get inclusive digit prefix int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) { // Get inclusive digit prefix from exclusive prefix (higher bins come first) inclusive_digit_prefix[track] = (bin_idx == 0) ? (BLOCK_THREADS * ITEMS_PER_THREAD) : temp_storage.exclusive_digit_prefix[bin_idx - 1]; } else { // Get inclusive digit prefix from exclusive prefix (lower bins come first) inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ? (BLOCK_THREADS * ITEMS_PER_THREAD) : temp_storage.exclusive_digit_prefix[bin_idx + 1]; } } } CTA_SYNC(); // Update global scatter base offsets for each digit #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_offset[track] -= exclusive_digit_prefix[track]; temp_storage.keys_and_offsets.relative_bin_offsets[bin_idx] = bin_offset[track]; bin_offset[track] += inclusive_digit_prefix[track]; } } CTA_SYNC(); // Scatter keys ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); // Gather/scatter values GatherScatterValues(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type()); } //--------------------------------------------------------------------- // Copy shortcut //--------------------------------------------------------------------- /** * Copy tiles within the range of input */ template < typename InputIteratorT, typename T> __device__ __forceinline__ void Copy( InputIteratorT d_in, T *d_out, OffsetT block_offset, OffsetT block_end) { // Simply copy the input while (block_end - block_offset >= TILE_ITEMS) { T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items); CTA_SYNC(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items); block_offset += TILE_ITEMS; } // Clean up last partial tile with guarded-I/O if (block_offset < block_end) { OffsetT valid_items = block_end - block_offset; T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); CTA_SYNC(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); } } /** * Copy tiles within the range of input (specialized for NullType) */ template __device__ __forceinline__ void Copy( InputIteratorT /*d_in*/, NullType * /*d_out*/, OffsetT /*block_offset*/, OffsetT /*block_end*/) {} //--------------------------------------------------------------------- // Interface 
//--------------------------------------------------------------------- /** * Constructor */ __device__ __forceinline__ AgentRadixSortDownsweep( TempStorage &temp_storage, OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD], OffsetT num_items, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int current_bit, int num_bits) : temp_storage(temp_storage.Alias()), d_keys_in(reinterpret_cast(d_keys_in)), d_values_in(d_values_in), d_keys_out(reinterpret_cast(d_keys_out)), d_values_out(d_values_out), digit_extractor(current_bit, num_bits), short_circuit(1) { #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { this->bin_offset[track] = bin_offset[track]; int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { // Short circuit if the histogram has only bin counts of only zeros or problem-size short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items)); } } short_circuit = CTA_SYNC_AND(short_circuit); } /** * Constructor */ __device__ __forceinline__ AgentRadixSortDownsweep( TempStorage &temp_storage, OffsetT num_items, OffsetT *d_spine, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int current_bit, int num_bits) : temp_storage(temp_storage.Alias()), d_keys_in(reinterpret_cast(d_keys_in)), d_values_in(d_values_in), d_keys_out(reinterpret_cast(d_keys_out)), d_values_out(d_values_out), digit_extractor(current_bit, num_bits), short_circuit(1) { #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) bin_idx = RADIX_DIGITS - bin_idx - 1; // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx]; short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); // Load my block's bin offset for my bin bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; } } short_circuit = CTA_SYNC_AND(short_circuit); } /** * Distribute keys from a segment of input tiles. */ __device__ __forceinline__ void ProcessRegion( OffsetT block_offset, OffsetT block_end) { if (short_circuit) { // Copy keys Copy(d_keys_in, d_keys_out, block_offset, block_end); // Copy values Copy(d_values_in, d_values_out, block_offset, block_end); } else { // Process full tiles of tile_items #pragma unroll 1 while (block_end - block_offset >= TILE_ITEMS) { ProcessTile(block_offset); block_offset += TILE_ITEMS; CTA_SYNC(); } // Clean up last partial tile with guarded-I/O if (block_offset < block_end) { ProcessTile(block_offset, block_end - block_offset); } } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_radix_sort_histogram.cuh000066400000000000000000000221221434614775400222620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * agent_radix_sort_histogram.cuh implements a stateful abstraction of CUDA * thread blocks for participating in the device histogram kernel used for * one-sweep radix sorting. */ #pragma once #include "../block/block_load.cuh" #include "../block/radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../thread/thread_reduce.cuh" #include "../util_math.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN template < int _BLOCK_THREADS, int _ITEMS_PER_THREAD, int NOMINAL_4B_NUM_PARTS, typename ComputeT, int _RADIX_BITS> struct AgentRadixSortHistogramPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ITEMS_PER_THREAD = _ITEMS_PER_THREAD, /** NUM_PARTS is the number of private histograms (parts) each histogram is split * into. Each warp lane is assigned to a specific part based on the lane * ID. However, lanes with the same ID in different warp use the same private * histogram. This arrangement helps reduce the degree of conflicts in atomic * operations. 
*/ NUM_PARTS = CUB_MAX(1, NOMINAL_4B_NUM_PARTS * 4 / CUB_MAX(sizeof(ComputeT), 4)), RADIX_BITS = _RADIX_BITS, }; }; template < int _BLOCK_THREADS, int _RADIX_BITS> struct AgentRadixSortExclusiveSumPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, RADIX_BITS = _RADIX_BITS, }; }; template < typename AgentRadixSortHistogramPolicy, bool IS_DESCENDING, typename KeyT, typename OffsetT> struct AgentRadixSortHistogram { // constants enum { ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD, BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS, RADIX_DIGITS = 1 << RADIX_BITS, MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS, NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS, }; typedef RadixSortTwiddle Twiddle; typedef std::uint32_t ShmemCounterT; typedef ShmemCounterT ShmemAtomicCounterT; typedef typename Traits::UnsignedBits UnsignedBits; struct _TempStorage { ShmemAtomicCounterT bins[MAX_NUM_PASSES][RADIX_DIGITS][NUM_PARTS]; }; struct TempStorage : Uninitialized<_TempStorage> {}; // thread fields // shared memory storage _TempStorage& s; // bins for the histogram OffsetT* d_bins_out; // data to compute the histogram const UnsignedBits* d_keys_in; // number of data items OffsetT num_items; // begin and end bits for sorting int begin_bit, end_bit; // number of sorting passes int num_passes; __device__ __forceinline__ AgentRadixSortHistogram (TempStorage& temp_storage, OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int begin_bit, int end_bit) : s(temp_storage.Alias()), d_bins_out(d_bins_out), d_keys_in(reinterpret_cast(d_keys_in)), num_items(num_items), begin_bit(begin_bit), end_bit(end_bit), num_passes((end_bit - begin_bit + RADIX_BITS - 1) / RADIX_BITS) {} __device__ __forceinline__ void Init() { // Initialize bins to 0. #pragma unroll for (int bin = threadIdx.x; bin < RADIX_DIGITS; bin += BLOCK_THREADS) { #pragma unroll for (int pass = 0; pass < num_passes; ++pass) { #pragma unroll for (int part = 0; part < NUM_PARTS; ++part) { s.bins[pass][bin][part] = 0; } } } CTA_SYNC(); } __device__ __forceinline__ void LoadTileKeys(OffsetT tile_offset, UnsignedBits (&keys)[ITEMS_PER_THREAD]) { // tile_offset < num_items always, hence the line below works bool full_tile = num_items - tile_offset >= TILE_ITEMS; if (full_tile) { LoadDirectStriped( threadIdx.x, d_keys_in + tile_offset, keys); } else { LoadDirectStriped( threadIdx.x, d_keys_in + tile_offset, keys, num_items - tile_offset, Twiddle::DefaultKey()); } #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { keys[u] = Twiddle::In(keys[u]); } } __device__ __forceinline__ void AccumulateSharedHistograms(OffsetT tile_offset, UnsignedBits (&keys)[ITEMS_PER_THREAD]) { int part = LaneId() % NUM_PARTS; #pragma unroll for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { int num_bits = CUB_MIN(RADIX_BITS, end_bit - current_bit); ShiftDigitExtractor digit_extractor(current_bit, num_bits); #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { int bin = digit_extractor.Digit(keys[u]); // Using cuda::atomic<> results in lower performance on GP100, // so atomicAdd() is used instead. 
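        // Each lane accumulates into its own private copy of the bin
        // counter (part == LaneId() % NUM_PARTS above), so neighboring lanes
        // of a warp hit different copies of the same bin and shared-memory
        // atomic conflicts are reduced. For example, with NUM_PARTS == 4,
        // lanes 0, 4, 8, ... of every warp share part 0, lanes 1, 5, 9, ...
        // share part 1, and so on. The NUM_PARTS copies of each bin are
        // summed later in AccumulateGlobalHistograms().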
atomicAdd(&s.bins[pass][bin][part], 1); } } } __device__ __forceinline__ void AccumulateGlobalHistograms() { #pragma unroll for (int bin = threadIdx.x; bin < RADIX_DIGITS; bin += BLOCK_THREADS) { #pragma unroll for (int pass = 0; pass < num_passes; ++pass) { OffsetT count = internal::ThreadReduce(s.bins[pass][bin], Sum()); if (count > 0) { // Using cuda::atomic<> here would also require using it in // other kernels. However, other kernels of onesweep sorting // (ExclusiveSum, Onesweep) don't need atomic // access. Therefore, atomicAdd() is used, until // cuda::atomic_ref<> becomes available. atomicAdd(&d_bins_out[pass * RADIX_DIGITS + bin], count); } } } } __device__ __forceinline__ void Process() { // Within a portion, avoid overflowing (u)int32 counters. // Between portions, accumulate results in global memory. const OffsetT MAX_PORTION_SIZE = 1 << 30; OffsetT num_portions = cub::DivideAndRoundUp(num_items, MAX_PORTION_SIZE); for (OffsetT portion = 0; portion < num_portions; ++portion) { // Reset the counters. Init(); CTA_SYNC(); // Process the tiles. OffsetT portion_offset = portion * MAX_PORTION_SIZE; OffsetT portion_size = CUB_MIN(MAX_PORTION_SIZE, num_items - portion_offset); for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x) { OffsetT tile_offset = portion_offset + offset; UnsignedBits keys[ITEMS_PER_THREAD]; LoadTileKeys(tile_offset, keys); AccumulateSharedHistograms(tile_offset, keys); } CTA_SYNC(); // Accumulate the result in global memory. AccumulateGlobalHistograms(); CTA_SYNC(); } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_radix_sort_onesweep.cuh000066400000000000000000000601111434614775400221120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * agent_radix_sort_onesweep.cuh implements a stateful abstraction of CUDA * thread blocks for participating in the device one-sweep radix sort kernel. */ #pragma once #include "../block/block_radix_rank.cuh" #include "../block/radix_rank_sort_operations.cuh" #include "../block/block_store.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** \brief cub::RadixSortStoreAlgorithm enumerates different algorithms to write * partitioned elements (keys, values) stored in shared memory into global * memory. Currently applies only to writing 4B keys in full tiles; in all other cases, * RADIX_SORT_STORE_DIRECT is used. */ enum RadixSortStoreAlgorithm { /** \brief Elements are statically distributed among block threads, which write them * into the appropriate partition in global memory. This results in fewer instructions * and more writes in flight at a given moment, but may generate more transactions. */ RADIX_SORT_STORE_DIRECT, /** \brief Elements are distributed among warps in a block distribution. Each warp * goes through its elements and tries to write them while minimizing the number of * memory transactions. This results in fewer memory transactions, but more * instructions and less writes in flight at a given moment. */ RADIX_SORT_STORE_ALIGNED }; template < int NOMINAL_BLOCK_THREADS_4B, int NOMINAL_ITEMS_PER_THREAD_4B, typename ComputeT, /** \brief Number of private histograms to use in the ranker; ignored if the ranking algorithm is not one of RADIX_RANK_MATCH_EARLY_COUNTS_* */ int _RANK_NUM_PARTS, /** \brief Ranking algorithm used in the onesweep kernel. Only algorithms that support warp-strided key arrangement and count callbacks are supported. 
*/ RadixRankAlgorithm _RANK_ALGORITHM, BlockScanAlgorithm _SCAN_ALGORITHM, RadixSortStoreAlgorithm _STORE_ALGORITHM, int _RADIX_BITS, typename ScalingType = RegBoundScaling< NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> > struct AgentRadixSortOnesweepPolicy : ScalingType { enum { RANK_NUM_PARTS = _RANK_NUM_PARTS, RADIX_BITS = _RADIX_BITS, }; static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; static const RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; template < typename AgentRadixSortOnesweepPolicy, bool IS_DESCENDING, typename KeyT, typename ValueT, typename OffsetT, typename PortionOffsetT> struct AgentRadixSortOnesweep { // constants enum { ITEMS_PER_THREAD = AgentRadixSortOnesweepPolicy::ITEMS_PER_THREAD, KEYS_ONLY = std::is_same::value, BLOCK_THREADS = AgentRadixSortOnesweepPolicy::BLOCK_THREADS, RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, RADIX_BITS = AgentRadixSortOnesweepPolicy::RADIX_BITS, RADIX_DIGITS = 1 << RADIX_BITS, BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS, FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS, WARP_THREADS = CUB_PTX_WARP_THREADS, BLOCK_WARPS = BLOCK_THREADS / WARP_THREADS, WARP_MASK = ~0, LOOKBACK_PARTIAL_MASK = 1 << (PortionOffsetT(sizeof(PortionOffsetT)) * 8 - 2), LOOKBACK_GLOBAL_MASK = 1 << (PortionOffsetT(sizeof(PortionOffsetT)) * 8 - 1), LOOKBACK_KIND_MASK = LOOKBACK_PARTIAL_MASK | LOOKBACK_GLOBAL_MASK, LOOKBACK_VALUE_MASK = ~LOOKBACK_KIND_MASK, }; typedef typename Traits::UnsignedBits UnsignedBits; typedef PortionOffsetT AtomicOffsetT; static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortOnesweepPolicy::RANK_ALGORITHM; static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortOnesweepPolicy::SCAN_ALGORITHM; static const RadixSortStoreAlgorithm STORE_ALGORITHM = sizeof(UnsignedBits) == sizeof(uint32_t) ? 
AgentRadixSortOnesweepPolicy::STORE_ALGORITHM : RADIX_SORT_STORE_DIRECT; typedef RadixSortTwiddle Twiddle; static_assert(RANK_ALGORITHM == RADIX_RANK_MATCH || RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ANY || RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR, "for onesweep agent, the ranking algorithm must warp-strided key arrangement"); using BlockRadixRankT = cub::detail::conditional_t< RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR, BlockRadixRankMatchEarlyCounts, cub::detail::conditional_t< RANK_ALGORITHM == RADIX_RANK_MATCH, BlockRadixRankMatch, BlockRadixRankMatchEarlyCounts>>; // temporary storage struct TempStorage_ { union { UnsignedBits keys_out[TILE_ITEMS]; ValueT values_out[TILE_ITEMS]; typename BlockRadixRankT::TempStorage rank_temp_storage; }; union { OffsetT global_offsets[RADIX_DIGITS]; PortionOffsetT block_idx; }; }; using TempStorage = Uninitialized; // thread variables TempStorage_& s; // kernel parameters AtomicOffsetT* d_lookback; AtomicOffsetT* d_ctrs; OffsetT* d_bins_out; const OffsetT* d_bins_in; UnsignedBits* d_keys_out; const UnsignedBits* d_keys_in; ValueT* d_values_out; const ValueT* d_values_in; PortionOffsetT num_items; ShiftDigitExtractor digit_extractor; // other thread variables int warp; int lane; PortionOffsetT block_idx; bool full_block; // helper methods __device__ __forceinline__ int Digit(UnsignedBits key) { return digit_extractor.Digit(key); } __device__ __forceinline__ int ThreadBin(int u) { return threadIdx.x * BINS_PER_THREAD + u; } __device__ __forceinline__ void LookbackPartial(int (&bins)[BINS_PER_THREAD]) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || bin < RADIX_DIGITS) { // write the local sum into the bin AtomicOffsetT& loc = d_lookback[block_idx * RADIX_DIGITS + bin]; PortionOffsetT value = bins[u] | LOOKBACK_PARTIAL_MASK; ThreadStore(&loc, value); } } } struct CountsCallback { typedef AgentRadixSortOnesweep AgentT; AgentT& agent; int (&bins)[BINS_PER_THREAD]; UnsignedBits (&keys)[ITEMS_PER_THREAD]; static const bool EMPTY = false; __device__ __forceinline__ CountsCallback( AgentT& agent, int (&bins)[BINS_PER_THREAD], UnsignedBits (&keys)[ITEMS_PER_THREAD]) : agent(agent), bins(bins), keys(keys) {} __device__ __forceinline__ void operator()(int (&other_bins)[BINS_PER_THREAD]) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { bins[u] = other_bins[u]; } agent.LookbackPartial(bins); agent.TryShortCircuit(keys, bins); } }; __device__ __forceinline__ void LookbackGlobal(int (&bins)[BINS_PER_THREAD]) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || bin < RADIX_DIGITS) { PortionOffsetT inc_sum = bins[u]; int want_mask = ~0; // backtrack as long as necessary for (PortionOffsetT block_jdx = block_idx - 1; block_jdx >= 0; --block_jdx) { // wait for some value to appear PortionOffsetT value_j = 0; AtomicOffsetT& loc_j = d_lookback[block_jdx * RADIX_DIGITS + bin]; do { __threadfence_block(); // prevent hoisting loads from loop value_j = ThreadLoad(&loc_j); } while (value_j == 0); inc_sum += value_j & LOOKBACK_VALUE_MASK; want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask); if (value_j & LOOKBACK_GLOBAL_MASK) break; } AtomicOffsetT& loc_i = d_lookback[block_idx * RADIX_DIGITS + bin]; PortionOffsetT value_i = inc_sum | LOOKBACK_GLOBAL_MASK; ThreadStore(&loc_i, value_i); s.global_offsets[bin] += inc_sum - bins[u]; } } } __device__ __forceinline__ void LoadKeys(OffsetT tile_offset, 
UnsignedBits (&keys)[ITEMS_PER_THREAD]) { if (full_block) { LoadDirectWarpStriped(threadIdx.x, d_keys_in + tile_offset, keys); } else { LoadDirectWarpStriped(threadIdx.x, d_keys_in + tile_offset, keys, num_items - tile_offset, Twiddle::DefaultKey()); } #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { keys[u] = Twiddle::In(keys[u]); } } __device__ __forceinline__ void LoadValues(OffsetT tile_offset, ValueT (&values)[ITEMS_PER_THREAD]) { if (full_block) { LoadDirectWarpStriped(threadIdx.x, d_values_in + tile_offset, values); } else { int tile_items = num_items - tile_offset; LoadDirectWarpStriped(threadIdx.x, d_values_in + tile_offset, values, tile_items); } } /** Checks whether "short-circuiting" is possible. Short-circuiting happens * if all TILE_ITEMS keys fall into the same bin, i.e. have the same digit * value (note that it only happens for full tiles). If short-circuiting is * performed, the part of the ranking algorithm after the CountsCallback, as * well as the rest of the sorting (e.g. scattering keys and values to * shared and global memory) are skipped; updates related to decoupled * look-back are still performed. Instead, the keys assigned to the current * thread block are written cooperatively into a contiguous location in * d_keys_out corresponding to their digit. The values (if also sorting * values) assigned to the current thread block are similarly copied from * d_values_in to d_values_out. */ __device__ __forceinline__ void TryShortCircuit(UnsignedBits (&keys)[ITEMS_PER_THREAD], int (&bins)[BINS_PER_THREAD]) { // check if any bin can be short-circuited bool short_circuit = false; #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { if (FULL_BINS || ThreadBin(u) < RADIX_DIGITS) { short_circuit = short_circuit || bins[u] == TILE_ITEMS; } } short_circuit = CTA_SYNC_OR(short_circuit); if (!short_circuit) return; ShortCircuitCopy(keys, bins); } __device__ __forceinline__ void ShortCircuitCopy(UnsignedBits (&keys)[ITEMS_PER_THREAD], int (&bins)[BINS_PER_THREAD]) { // short-circuit handling; note that global look-back is still required // compute offsets int common_bin = Digit(keys[0]); int offsets[BINS_PER_THREAD]; #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); offsets[u] = bin > common_bin ? 
TILE_ITEMS : 0; } // global lookback LoadBinsToOffsetsGlobal(offsets); LookbackGlobal(bins); UpdateBinsGlobal(bins, offsets); CTA_SYNC(); // scatter the keys OffsetT global_offset = s.global_offsets[common_bin]; #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { keys[u] = Twiddle::Out(keys[u]); } if (full_block) { StoreDirectWarpStriped(threadIdx.x, d_keys_out + global_offset, keys); } else { int tile_items = num_items - block_idx * TILE_ITEMS; StoreDirectWarpStriped(threadIdx.x, d_keys_out + global_offset, keys, tile_items); } if (!KEYS_ONLY) { // gather and scatter the values ValueT values[ITEMS_PER_THREAD]; LoadValues(block_idx * TILE_ITEMS, values); if (full_block) { StoreDirectWarpStriped(threadIdx.x, d_values_out + global_offset, values); } else { int tile_items = num_items - block_idx * TILE_ITEMS; StoreDirectWarpStriped(threadIdx.x, d_values_out + global_offset, values, tile_items); } } // exit early ThreadExit(); } __device__ __forceinline__ void ScatterKeysShared(UnsignedBits (&keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD]) { // write to shared memory #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { s.keys_out[ranks[u]] = keys[u]; } } __device__ __forceinline__ void ScatterValuesShared(ValueT (&values)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD]) { // write to shared memory #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { s.values_out[ranks[u]] = values[u]; } } __device__ __forceinline__ void LoadBinsToOffsetsGlobal(int (&offsets)[BINS_PER_THREAD]) { // global offset - global part #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || bin < RADIX_DIGITS) { s.global_offsets[bin] = d_bins_in[bin] - offsets[u]; } } } __device__ __forceinline__ void UpdateBinsGlobal(int (&bins)[BINS_PER_THREAD], int (&offsets)[BINS_PER_THREAD]) { bool last_block = (block_idx + 1) * TILE_ITEMS >= num_items; if (d_bins_out != NULL && last_block) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || bin < RADIX_DIGITS) { d_bins_out[bin] = s.global_offsets[bin] + offsets[u] + bins[u]; } } } } template __device__ __forceinline__ void ScatterKeysGlobalDirect() { int tile_items = FULL_TILE ? TILE_ITEMS : num_items - block_idx * TILE_ITEMS; #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { int idx = threadIdx.x + u * BLOCK_THREADS; UnsignedBits key = s.keys_out[idx]; OffsetT global_idx = idx + s.global_offsets[Digit(key)]; if (FULL_TILE || idx < tile_items) { d_keys_out[global_idx] = Twiddle::Out(key); } WARP_SYNC(WARP_MASK); } } template __device__ __forceinline__ void ScatterValuesGlobalDirect(int (&digits)[ITEMS_PER_THREAD]) { int tile_items = FULL_TILE ? 
TILE_ITEMS : num_items - block_idx * TILE_ITEMS; #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { int idx = threadIdx.x + u * BLOCK_THREADS; ValueT value = s.values_out[idx]; OffsetT global_idx = idx + s.global_offsets[digits[u]]; if (FULL_TILE || idx < tile_items) d_values_out[global_idx] = value; WARP_SYNC(WARP_MASK); } } __device__ __forceinline__ void ScatterKeysGlobalAligned() { // this only works with full tiles const int ITEMS_PER_WARP = TILE_ITEMS / BLOCK_WARPS; const int ALIGN = 8; const auto CACHE_MODIFIER = STORE_CG; int warp_start = warp * ITEMS_PER_WARP; int warp_end = (warp + 1) * ITEMS_PER_WARP; int warp_offset = warp_start; while (warp_offset < warp_end - WARP_THREADS) { int idx = warp_offset + lane; UnsignedBits key = s.keys_out[idx]; UnsignedBits key_out = Twiddle::Out(key); OffsetT global_idx = idx + s.global_offsets[Digit(key)]; int last_lane = WARP_THREADS - 1; int num_writes = WARP_THREADS; if (lane == last_lane) { num_writes -= int(global_idx + 1) % ALIGN; } num_writes = SHFL_IDX_SYNC(num_writes, last_lane, WARP_MASK); if (lane < num_writes) { ThreadStore(&d_keys_out[global_idx], key_out); } warp_offset += num_writes; } { int num_writes = warp_end - warp_offset; if (lane < num_writes) { int idx = warp_offset + lane; UnsignedBits key = s.keys_out[idx]; OffsetT global_idx = idx + s.global_offsets[Digit(key)]; ThreadStore(&d_keys_out[global_idx], Twiddle::Out(key)); } } } __device__ __forceinline__ void ScatterKeysGlobal() { // write block data to global memory if (full_block) { if (STORE_ALGORITHM == RADIX_SORT_STORE_ALIGNED) { ScatterKeysGlobalAligned(); } else { ScatterKeysGlobalDirect(); } } else { ScatterKeysGlobalDirect(); } } __device__ __forceinline__ void ScatterValuesGlobal(int (&digits)[ITEMS_PER_THREAD]) { // write block data to global memory if (full_block) { ScatterValuesGlobalDirect(digits); } else { ScatterValuesGlobalDirect(digits); } } __device__ __forceinline__ void ComputeKeyDigits(int (&digits)[ITEMS_PER_THREAD]) { #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { int idx = threadIdx.x + u * BLOCK_THREADS; digits[u] = Digit(s.keys_out[idx]); } } __device__ __forceinline__ void GatherScatterValues( int (&ranks)[ITEMS_PER_THREAD], Int2Type keys_only) { // compute digits corresponding to the keys int digits[ITEMS_PER_THREAD]; ComputeKeyDigits(digits); // load values ValueT values[ITEMS_PER_THREAD]; LoadValues(block_idx * TILE_ITEMS, values); // scatter values CTA_SYNC(); ScatterValuesShared(values, ranks); CTA_SYNC(); ScatterValuesGlobal(digits); } __device__ __forceinline__ void GatherScatterValues( int (&ranks)[ITEMS_PER_THREAD], Int2Type keys_only) {} __device__ __forceinline__ void Process() { // load keys // if warp1 < warp2, all elements of warp1 occur before those of warp2 // in the source array UnsignedBits keys[ITEMS_PER_THREAD]; LoadKeys(block_idx * TILE_ITEMS, keys); // rank keys int ranks[ITEMS_PER_THREAD]; int exclusive_digit_prefix[BINS_PER_THREAD]; int bins[BINS_PER_THREAD]; BlockRadixRankT(s.rank_temp_storage).RankKeys( keys, ranks, digit_extractor, exclusive_digit_prefix, CountsCallback(*this, bins, keys)); // scatter keys in shared memory CTA_SYNC(); ScatterKeysShared(keys, ranks); // compute global offsets LoadBinsToOffsetsGlobal(exclusive_digit_prefix); LookbackGlobal(bins); UpdateBinsGlobal(bins, exclusive_digit_prefix); // scatter keys in global memory CTA_SYNC(); ScatterKeysGlobal(); // scatter values if necessary GatherScatterValues(ranks, Int2Type()); } __device__ __forceinline__ // 
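    // The tile (block) index is not taken from blockIdx.x: thread 0 obtains
    // it by atomically incrementing d_ctrs, so tiles are numbered in the
    // order in which thread blocks actually begin executing. The decoupled
    // look-back in LookbackGlobal() can therefore spin on lower-numbered
    // tiles knowing they are already resident or finished.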
AgentRadixSortOnesweep(TempStorage &temp_storage, AtomicOffsetT *d_lookback, AtomicOffsetT *d_ctrs, OffsetT *d_bins_out, const OffsetT *d_bins_in, KeyT *d_keys_out, const KeyT *d_keys_in, ValueT *d_values_out, const ValueT *d_values_in, PortionOffsetT num_items, int current_bit, int num_bits) : s(temp_storage.Alias()) , d_lookback(d_lookback) , d_ctrs(d_ctrs) , d_bins_out(d_bins_out) , d_bins_in(d_bins_in) , d_keys_out(reinterpret_cast(d_keys_out)) , d_keys_in(reinterpret_cast(d_keys_in)) , d_values_out(d_values_out) , d_values_in(d_values_in) , num_items(num_items) , digit_extractor(current_bit, num_bits) , warp(threadIdx.x / WARP_THREADS) , lane(LaneId()) { // initialization if (threadIdx.x == 0) { s.block_idx = atomicAdd(d_ctrs, 1); } CTA_SYNC(); block_idx = s.block_idx; full_block = (block_idx + 1) * TILE_ITEMS <= num_items; } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_radix_sort_upsweep.cuh000066400000000000000000000427261434614775400217710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
*/ #pragma once #include "../thread/thread_reduce.cuh" #include "../thread/thread_load.cuh" #include "../warp/warp_reduce.cuh" #include "../block/block_load.cuh" #include "../block/radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_type.cuh" #include "../iterator/cache_modified_input_iterator.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentRadixSortUpsweep */ template < int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input) typename ComputeT, ///< Dominant compute type CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys int _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) typename ScalingType = RegBoundScaling > struct AgentRadixSortUpsweepPolicy : ScalingType { enum { RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) }; static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . */ template < typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type typename KeyT, ///< KeyT type typename OffsetT> ///< Signed integer type for global offsets struct AgentRadixSortUpsweep { //--------------------------------------------------------------------- // Type definitions and constants //--------------------------------------------------------------------- typedef typename Traits::UnsignedBits UnsignedBits; // Integer type for digit counters (to be packed into words of PackedCounters) typedef unsigned char DigitCounter; // Integer type for packing DigitCounters into columns of shared memory banks typedef unsigned int PackedCounter; static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER; enum { RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS, BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS, KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD, RADIX_DIGITS = 1 << RADIX_BITS, LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, BYTES_PER_COUNTER = sizeof(DigitCounter), LOG_BYTES_PER_COUNTER = Log2::VALUE, PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), LOG_PACKING_RATIO = Log2::VALUE, LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)), COUNTER_LANES = 1 << LOG_COUNTER_LANES, // To prevent counter overflow, we must periodically unpack and aggregate the // digit counters back into registers. Each counter lane is assigned to a // warp for aggregation. 
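        // Each DigitCounter is a single byte (PACKING_RATIO of them are
        // packed into one PackedCounter word), so a thread may apply at most
        // 255 increments to any one counter before it is unpacked into the
        // wider per-thread local_counts registers; hence UNROLL_COUNT below
        // is capped at 255 / KEYS_PER_THREAD.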
LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), // Unroll tiles in batches without risk of counter overflow UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, }; // Input iterator wrapper type (for applying cache modifier)s typedef CacheModifiedInputIterator KeysItr; // Digit extractor type typedef BFEDigitExtractor DigitExtractorT; /** * Shared memory storage layout */ union __align__(16) _TempStorage { DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS]; OffsetT block_counters[WARP_THREADS][RADIX_DIGITS]; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Thread fields (aggregate state bundle) //--------------------------------------------------------------------- // Shared storage for this CTA _TempStorage &temp_storage; // Thread-local counters for periodically aggregating composite-counter lanes OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO]; // Input and output device pointers KeysItr d_keys_in; // Digit extractor DigitExtractorT digit_extractor; //--------------------------------------------------------------------- // Helper structure for templated iteration //--------------------------------------------------------------------- // Iterate template struct Iterate { // BucketKeys static __device__ __forceinline__ void BucketKeys( AgentRadixSortUpsweep &cta, UnsignedBits keys[KEYS_PER_THREAD]) { cta.Bucket(keys[COUNT]); // Next Iterate::BucketKeys(cta, keys); } }; // Terminate template struct Iterate { // BucketKeys static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {} }; //--------------------------------------------------------------------- // Utility methods //--------------------------------------------------------------------- /** * Decode a key and increment corresponding smem digit counter */ __device__ __forceinline__ void Bucket(UnsignedBits key) { // Perform transform op UnsignedBits converted_key = Traits::TwiddleIn(key); // Extract current digit bits UnsignedBits digit = digit_extractor.Digit(converted_key); // Get sub-counter offset UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); // Get row offset UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; // Increment counter temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++; } /** * Reset composite counters */ __device__ __forceinline__ void ResetDigitCounters() { #pragma unroll for (int LANE = 0; LANE < COUNTER_LANES; LANE++) { temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0; } } /** * Reset the unpacked counters in each thread */ __device__ __forceinline__ void ResetUnpackedCounters() { #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) { #pragma unroll for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) { local_counts[LANE][UNPACKED_COUNTER] = 0; } } } /** * Extracts and aggregates the digit counters for each counter lane * owned by this warp */ __device__ __forceinline__ void UnpackDigitCounts() { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; unsigned int warp_tid = LaneId(); #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) { const int counter_lane = (LANE * WARPS) + warp_id; if (counter_lane < COUNTER_LANES) { #pragma unroll for (int PACKED_COUNTER = 0; 
PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) { #pragma unroll for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) { OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; local_counts[LANE][UNPACKED_COUNTER] += counter; } } } } } /** * Processes a single, full tile */ __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset) { // Tile of keys UnsignedBits keys[KEYS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); // Prevent hoisting CTA_SYNC(); // Bucket tile of keys Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); } /** * Processes a single load (may have some threads masked off) */ __device__ __forceinline__ void ProcessPartialTile( OffsetT block_offset, const OffsetT &block_end) { // Process partial tile if necessary using single loads for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS) { // Load and bucket key UnsignedBits key = d_keys_in[block_offset + offset]; Bucket(key); } } //--------------------------------------------------------------------- // Interface //--------------------------------------------------------------------- /** * Constructor */ __device__ __forceinline__ AgentRadixSortUpsweep( TempStorage &temp_storage, const KeyT *d_keys_in, int current_bit, int num_bits) : temp_storage(temp_storage.Alias()), d_keys_in(reinterpret_cast(d_keys_in)), digit_extractor(current_bit, num_bits) {} /** * Compute radix digit histograms from a segment of input tiles. */ __device__ __forceinline__ void ProcessRegion( OffsetT block_offset, const OffsetT &block_end) { // Reset digit counters in smem and unpacked counters in registers ResetDigitCounters(); ResetUnpackedCounters(); // Unroll batches of full tiles while (block_end - block_offset >= UNROLLED_ELEMENTS) { for (int i = 0; i < UNROLL_COUNT; ++i) { ProcessFullTile(block_offset); block_offset += TILE_ITEMS; } CTA_SYNC(); // Aggregate back into local_count registers to prevent overflow UnpackDigitCounts(); CTA_SYNC(); // Reset composite counters in lanes ResetDigitCounters(); } // Unroll single full tiles while (block_end - block_offset >= TILE_ITEMS) { ProcessFullTile(block_offset); block_offset += TILE_ITEMS; } // Process partial tile if necessary ProcessPartialTile( block_offset, block_end); CTA_SYNC(); // Aggregate back into local_count registers UnpackDigitCounts(); } /** * Extract counts (saving them to the external array) */ template __device__ __forceinline__ void ExtractCounts( OffsetT *counters, int bin_stride = 1, int bin_offset = 0) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; unsigned int warp_tid = LaneId(); // Place unpacked digit counters in shared memory #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) { int counter_lane = (LANE * WARPS) + warp_id; if (counter_lane < COUNTER_LANES) { int digit_row = counter_lane << LOG_PACKING_RATIO; #pragma unroll for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) { int bin_idx = digit_row + UNPACKED_COUNTER; temp_storage.block_counters[warp_tid][bin_idx] = local_counts[LANE][UNPACKED_COUNTER]; } } } CTA_SYNC(); // Rake-reduce bin_count reductions // Whole blocks #pragma unroll for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS; BIN_BASE += BLOCK_THREADS) { int bin_idx = BIN_BASE + threadIdx.x; OffsetT bin_count = 0; #pragma unroll for (int i = 0; i < WARP_THREADS; ++i) bin_count += 
temp_storage.block_counters[i][bin_idx]; if (IS_DESCENDING) bin_idx = RADIX_DIGITS - bin_idx - 1; counters[(bin_stride * bin_idx) + bin_offset] = bin_count; } // Remainder if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS)) { int bin_idx = threadIdx.x; OffsetT bin_count = 0; #pragma unroll for (int i = 0; i < WARP_THREADS; ++i) bin_count += temp_storage.block_counters[i][bin_idx]; if (IS_DESCENDING) bin_idx = RADIX_DIGITS - bin_idx - 1; counters[(bin_stride * bin_idx) + bin_offset] = bin_count; } } /** * Extract counts */ template __device__ __forceinline__ void ExtractCounts( OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; unsigned int warp_tid = LaneId(); // Place unpacked digit counters in shared memory #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) { int counter_lane = (LANE * WARPS) + warp_id; if (counter_lane < COUNTER_LANES) { int digit_row = counter_lane << LOG_PACKING_RATIO; #pragma unroll for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) { int bin_idx = digit_row + UNPACKED_COUNTER; temp_storage.block_counters[warp_tid][bin_idx] = local_counts[LANE][UNPACKED_COUNTER]; } } } CTA_SYNC(); // Rake-reduce bin_count reductions #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_count[track] = 0; #pragma unroll for (int i = 0; i < WARP_THREADS; ++i) bin_count[track] += temp_storage.block_counters[i][bin_idx]; } } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_reduce.cuh000066400000000000000000000414321434614775400173030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::AgentReduce implements a stateful abstraction of CUDA thread * blocks for participating in device-wide reduction. */ #pragma once #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentReduce * @tparam NOMINAL_BLOCK_THREADS_4B Threads per thread block * @tparam NOMINAL_ITEMS_PER_THREAD_4B Items per thread (per tile of input) * @tparam ComputeT Dominant compute type * @tparam _VECTOR_LOAD_LENGTH Number of items per vectorized load * @tparam _BLOCK_ALGORITHM Cooperative block-wide reduction algorithm to use * @tparam _LOAD_MODIFIER Cache load modifier for reading input elements */ template > struct AgentReducePolicy : ScalingType { /// Number of items per vectorized load static constexpr int VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH; /// Cooperative block-wide reduction algorithm to use static constexpr BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; /// Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentReduce implements a stateful abstraction of CUDA thread blocks * for participating in device-wide reduction . * * Each thread reduces only the values it loads. If `FIRST_TILE`, this partial * reduction is stored into `thread_aggregate`. Otherwise it is accumulated * into `thread_aggregate`. 
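 *
 * Loads are vectorized only when the input iterator is a native pointer to a
 * primitive type, ITEMS_PER_THREAD is a multiple of the policy's
 * VECTOR_LOAD_LENGTH, and the pointer is aligned to the vector width (see
 * ATTEMPT_VECTORIZATION and IsAligned() below); otherwise items are loaded
 * individually in a block-striped arrangement.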
* * @tparam AgentReducePolicy * Parameterized AgentReducePolicy tuning policy type * * @tparam InputIteratorT * Random-access iterator type for input * * @tparam OutputIteratorT * Random-access iterator type for output * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOp * Binary reduction operator type having member * `auto operator()(T &&a, U &&b)` * * @tparam AccumT * The type of intermediate accumulator (according to P2322R6) */ template struct AgentReduce { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// The input value type using InputT = cub::detail::value_t; /// Vector type of InputT for data movement using VectorT = typename CubVector::Type; /// Input iterator wrapper type (for applying cache modifier) // Wrap the native input pointer with CacheModifiedInputIterator // or directly use the supplied input iterator type using WrappedInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, InputIteratorT>; /// Constants static constexpr int BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD; static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; static constexpr int VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH); // Can vectorize according to the policy if the input iterator is a native // pointer to a primitive type static constexpr bool ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && (std::is_pointer::value) && Traits::PRIMITIVE; static constexpr CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; static constexpr BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; /// Parameterized BlockReduce primitive using BlockReduceT = BlockReduce; /// Shared memory type required by this thread block struct _TempStorage { typename BlockReduceT::TempStorage reduce; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage &temp_storage; ///< Reference to temp_storage InputIteratorT d_in; ///< Input data to reduce WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce ReductionOp reduction_op; ///< Binary reduction operator //--------------------------------------------------------------------- // Utility //--------------------------------------------------------------------- // Whether or not the input is aligned with the vector type (specialized for // types we can vectorize) template static __device__ __forceinline__ bool IsAligned(Iterator d_in, Int2Type /*can_vectorize*/) { return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; } // Whether or not the input is aligned with the vector type (specialized for // types we cannot vectorize) template static __device__ __forceinline__ bool IsAligned(Iterator /*d_in*/, Int2Type /*can_vectorize*/) { return false; } //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- /** * @brief Constructor * @param temp_storage Reference to temp_storage * @param d_in Input data to reduce * @param reduction_op Binary 
reduction operator */ __device__ __forceinline__ AgentReduce(TempStorage &temp_storage, InputIteratorT d_in, ReductionOp reduction_op) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_wrapped_in(d_in) , reduction_op(reduction_op) {} //--------------------------------------------------------------------- // Tile consumption //--------------------------------------------------------------------- /** * @brief Consume a full tile of input (non-vectorized) * @param block_offset The offset the tile to consume * @param valid_items The number of valid items in the tile * @param is_full_tile Whether or not this is a full tile * @param can_vectorize Whether or not we can vectorize loads */ template __device__ __forceinline__ void ConsumeTile(AccumT &thread_aggregate, OffsetT block_offset, int /*valid_items*/, Int2Type /*is_full_tile*/, Int2Type /*can_vectorize*/) { AccumT items[ITEMS_PER_THREAD]; // Load items in striped fashion LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); // Reduce items within each thread stripe thread_aggregate = (IS_FIRST_TILE) ? internal::ThreadReduce(items, reduction_op) : internal::ThreadReduce(items, reduction_op, thread_aggregate); } /** * Consume a full tile of input (vectorized) * @param block_offset The offset the tile to consume * @param valid_items The number of valid items in the tile * @param is_full_tile Whether or not this is a full tile * @param can_vectorize Whether or not we can vectorize loads */ template __device__ __forceinline__ void ConsumeTile(AccumT &thread_aggregate, OffsetT block_offset, int /*valid_items*/, Int2Type /*is_full_tile*/, Int2Type /*can_vectorize*/) { // Alias items as an array of VectorT and load it in striped fashion enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; // Fabricate a vectorized input iterator InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); CacheModifiedInputIterator d_vec_in(reinterpret_cast(d_in_unqualified)); // Load items as vector items InputT input_items[ITEMS_PER_THREAD]; VectorT *vec_items = reinterpret_cast(input_items); #pragma unroll for (int i = 0; i < WORDS; ++i) { vec_items[i] = d_vec_in[BLOCK_THREADS * i]; } // Convert from input type to output type AccumT items[ITEMS_PER_THREAD]; #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; ++i) { items[i] = input_items[i]; } // Reduce items within each thread stripe thread_aggregate = (IS_FIRST_TILE) ? 
internal::ThreadReduce(items, reduction_op) : internal::ThreadReduce(items, reduction_op, thread_aggregate); } /** * Consume a partial tile of input * @param block_offset The offset the tile to consume * @param valid_items The number of valid items in the tile * @param is_full_tile Whether or not this is a full tile * @param can_vectorize Whether or not we can vectorize loads */ template __device__ __forceinline__ void ConsumeTile(AccumT &thread_aggregate, OffsetT block_offset, int valid_items, Int2Type /*is_full_tile*/, Int2Type /*can_vectorize*/) { // Partial tile int thread_offset = threadIdx.x; // Read first item if ((IS_FIRST_TILE) && (thread_offset < valid_items)) { thread_aggregate = d_wrapped_in[block_offset + thread_offset]; thread_offset += BLOCK_THREADS; } // Continue reading items (block-striped) while (thread_offset < valid_items) { InputT item(d_wrapped_in[block_offset + thread_offset]); thread_aggregate = reduction_op(thread_aggregate, item); thread_offset += BLOCK_THREADS; } } //--------------------------------------------------------------- // Consume a contiguous segment of tiles //--------------------------------------------------------------------- /** * @brief Reduce a contiguous segment of input tiles * @param even_share GridEvenShare descriptor * @param can_vectorize Whether or not we can vectorize loads */ template __device__ __forceinline__ AccumT ConsumeRange(GridEvenShare &even_share, Int2Type can_vectorize) { AccumT thread_aggregate{}; if (even_share.block_offset + TILE_ITEMS > even_share.block_end) { // First tile isn't full (not all threads have valid items) int valid_items = even_share.block_end - even_share.block_offset; ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); return BlockReduceT(temp_storage.reduce) .Reduce(thread_aggregate, reduction_op, valid_items); } // At least one full block ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); even_share.block_offset += even_share.block_stride; // Consume subsequent full tiles of input while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) { ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); even_share.block_offset += even_share.block_stride; } // Consume a partially-full tile if (even_share.block_offset < even_share.block_end) { int valid_items = even_share.block_end - even_share.block_offset; ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); } // Compute block-wide reduction (all threads have valid items) return BlockReduceT(temp_storage.reduce) .Reduce(thread_aggregate, reduction_op); } /** * @brief Reduce a contiguous segment of input tiles * @param[in] block_offset Threadblock begin offset (inclusive) * @param[in] block_end Threadblock end offset (exclusive) */ __device__ __forceinline__ AccumT ConsumeRange(OffsetT block_offset, OffsetT block_end) { GridEvenShare even_share; even_share.template BlockInit(block_offset, block_end); return (IsAligned(d_in + block_offset, Int2Type())) ? 
ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); } /** * Reduce a contiguous segment of input tiles * @param[in] even_share GridEvenShare descriptor */ __device__ __forceinline__ AccumT ConsumeTiles(GridEvenShare &even_share) { // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread // block even_share.template BlockInit(); return (IsAligned(d_in, Int2Type())) ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_reduce_by_key.cuh000066400000000000000000000556471434614775400206620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::AgentReduceByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide reduce-value-by-key. 
*/ #pragma once #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * @brief Parameterizable tuning policy type for AgentReduceByKey * * @tparam _BLOCK_THREADS * Threads per thread block * * @tparam _ITEMS_PER_THREAD * Items per thread (per tile of input) * * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use */ template struct AgentReduceByKeyPolicy { ///< Threads per thread block static constexpr int BLOCK_THREADS = _BLOCK_THREADS; ///< Items per thread (per tile of input) static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; ///< The BlockLoad algorithm to use static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< The BlockScan algorithm to use static constexpr const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentReduceByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide reduce-value-by-key * * @tparam AgentReduceByKeyPolicyT * Parameterized AgentReduceByKeyPolicy tuning policy type * * @tparam KeysInputIteratorT * Random-access input iterator type for keys * * @tparam UniqueOutputIteratorT * Random-access output iterator type for keys * * @tparam ValuesInputIteratorT * Random-access input iterator type for values * * @tparam AggregatesOutputIteratorT * Random-access output iterator type for values * * @tparam NumRunsOutputIteratorT * Output iterator type for recording number of items selected * * @tparam EqualityOpT * KeyT equality operator type * * @tparam ReductionOpT * ValueT reduction operator type * * @tparam OffsetT * Signed integer type for global offsets */ template struct AgentReduceByKey { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input keys type using KeyInputT = cub::detail::value_t; // The output keys type using KeyOutputT = cub::detail::non_void_value_t; // The input values type using ValueInputT = cub::detail::value_t; // Tuple type for scanning (pairs accumulated segment-value with // segment-index) using OffsetValuePairT = KeyValuePair; // Tuple type for pairing keys and values using KeyValuePairT = KeyValuePair; // Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; // Guarded inequality functor template struct GuardedInequalityWrapper { /// Wrapped equality operator _EqualityOpT op; /// Items remaining int num_remaining; /// Constructor __host__ __device__ __forceinline__ GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op) , num_remaining(num_remaining) {} /// Boolean inequality operator, returns (a != b) template __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const { if (idx < num_remaining) { return !op(a, b); // In bounds } // Return true if first out-of-bounds item, false 
otherwise return (idx == num_remaining); } }; // Constants static constexpr int BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD; static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; static constexpr int TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1); // Whether or not the scan operation has a zero-valued identity value (true // if we're performing addition on a primitive type) static constexpr int HAS_IDENTITY_ZERO = (std::is_same::value) && (Traits::PRIMITIVE); // Cache-modified Input iterator wrapper type (for applying cache modifier) // for keys Wrap the native input pointer with // CacheModifiedValuesInputIterator or directly use the supplied input // iterator type using WrappedKeysInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, KeysInputIteratorT>; // Cache-modified Input iterator wrapper type (for applying cache modifier) // for values Wrap the native input pointer with // CacheModifiedValuesInputIterator or directly use the supplied input // iterator type using WrappedValuesInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, ValuesInputIteratorT>; // Cache-modified Input iterator wrapper type (for applying cache modifier) // for fixup values Wrap the native input pointer with // CacheModifiedValuesInputIterator or directly use the supplied input // iterator type using WrappedFixupInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, AggregatesOutputIteratorT>; // Reduce-value-by-segment scan operator using ReduceBySegmentOpT = ReduceBySegmentOp; // Parameterized BlockLoad type for keys using BlockLoadKeysT = BlockLoad; // Parameterized BlockLoad type for values using BlockLoadValuesT = BlockLoad; // Parameterized BlockDiscontinuity type for keys using BlockDiscontinuityKeys = BlockDiscontinuity; // Parameterized BlockScan type using BlockScanT = BlockScan; // Callback type for obtaining tile prefix during block scan using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Key and value exchange types typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1]; typedef AccumT ValueExchangeT[TILE_ITEMS + 1]; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { // Smem needed for tile scanning typename BlockScanT::TempStorage scan; // Smem needed for cooperative prefix callback typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for discontinuity detection typename BlockDiscontinuityKeys::TempStorage discontinuity; } scan_storage; // Smem needed for loading keys typename BlockLoadKeysT::TempStorage load_keys; // Smem needed for loading values typename BlockLoadValuesT::TempStorage load_values; // Smem needed for compacting key value pairs(allows non POD items in this // union) Uninitialized raw_exchange; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- /// Reference to temp_storage _TempStorage &temp_storage; /// Input keys WrappedKeysInputIteratorT d_keys_in; /// Unique output keys UniqueOutputIteratorT d_unique_out; /// Input values WrappedValuesInputIteratorT d_values_in; /// Output value aggregates AggregatesOutputIteratorT d_aggregates_out; /// Output pointer for total number of segments identified 
NumRunsOutputIteratorT d_num_runs_out; /// KeyT equality operator EqualityOpT equality_op; /// Reduction operator ReductionOpT reduction_op; /// Reduce-by-segment scan operator ReduceBySegmentOpT scan_op; //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- /** * @param temp_storage * Reference to temp_storage * * @param d_keys_in * Input keys * * @param d_unique_out * Unique output keys * * @param d_values_in * Input values * * @param d_aggregates_out * Output value aggregates * * @param d_num_runs_out * Output pointer for total number of segments identified * * @param equality_op * KeyT equality operator * * @param reduction_op * ValueT reduction operator */ __device__ __forceinline__ AgentReduceByKey(TempStorage &temp_storage, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op) : temp_storage(temp_storage.Alias()) , d_keys_in(d_keys_in) , d_unique_out(d_unique_out) , d_values_in(d_values_in) , d_aggregates_out(d_aggregates_out) , d_num_runs_out(d_num_runs_out) , equality_op(equality_op) , reduction_op(reduction_op) , scan_op(reduction_op) {} //--------------------------------------------------------------------- // Scatter utility methods //--------------------------------------------------------------------- /** * Directly scatter flagged items to output offsets */ __device__ __forceinline__ void ScatterDirect(KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], OffsetT (&segment_flags)[ITEMS_PER_THREAD], OffsetT (&segment_indices)[ITEMS_PER_THREAD]) { // Scatter flagged keys and values #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (segment_flags[ITEM]) { d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key; d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value; } } } /** * 2-phase scatter flagged items to output offsets * * The exclusive scan causes each head flag to be paired with the previous * value aggregate: the scatter offsets must be decremented for value * aggregates */ __device__ __forceinline__ void ScatterTwoPhase(KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], OffsetT (&segment_flags)[ITEMS_PER_THREAD], OffsetT (&segment_indices)[ITEMS_PER_THREAD], OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) { CTA_SYNC(); // Compact and scatter pairs #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (segment_flags[ITEM]) { temp_storage.raw_exchange .Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM]; } } CTA_SYNC(); for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) { KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item]; d_unique_out[num_tile_segments_prefix + item] = pair.key; d_aggregates_out[num_tile_segments_prefix + item] = pair.value; } } /** * Scatter flagged items */ __device__ __forceinline__ void Scatter(KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], OffsetT (&segment_flags)[ITEMS_PER_THREAD], OffsetT (&segment_indices)[ITEMS_PER_THREAD], OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) { // Do a one-phase scatter if (a) two-phase is disabled or (b) the average // number of selected items per thread is less than one if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS)) { ScatterTwoPhase(scatter_items, segment_flags, 
segment_indices, num_tile_segments, num_tile_segments_prefix); } else { ScatterDirect(scatter_items, segment_flags, segment_indices); } } //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * @brief Process a tile of input (dynamic chained scan) * * @tparam IS_LAST_TILE * Whether the current tile is the last tile * * @param num_remaining * Number of global input items remaining (including this tile) * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor */ template __device__ __forceinline__ void ConsumeTile(OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state) { // Tile keys KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys shuffled up KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile values AccumT values[ITEMS_PER_THREAD]; // Segment head flags OffsetT head_flags[ITEMS_PER_THREAD]; // Segment indices OffsetT segment_indices[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering KeyValuePairT scatter_items[ITEMS_PER_THREAD]; // Load keys if (IS_LAST_TILE) { BlockLoadKeysT(temp_storage.load_keys) .Load(d_keys_in + tile_offset, keys, num_remaining); } else { BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } // Load tile predecessor key in first thread KeyOutputT tile_predecessor; if (threadIdx.x == 0) { // if (tile_idx == 0) // first tile gets repeat of first item (thus first item will not // be flagged as a head) // else // Subsequent tiles get last key from previous tile tile_predecessor = (tile_idx == 0) ? 
keys[0] : d_keys_in[tile_offset - 1]; } CTA_SYNC(); // Load values if (IS_LAST_TILE) { BlockLoadValuesT(temp_storage.load_values) .Load(d_values_in + tile_offset, values, num_remaining); } else { BlockLoadValuesT(temp_storage.load_values) .Load(d_values_in + tile_offset, values); } CTA_SYNC(); // Initialize head-flags and shuffle up the previous keys if (IS_LAST_TILE) { // Use custom flag operator to additionally flag the first out-of-bounds // item GuardedInequalityWrapper flag_op(equality_op, num_remaining); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) .FlagHeads(head_flags, keys, prev_keys, flag_op, tile_predecessor); } else { InequalityWrapper flag_op(equality_op); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) .FlagHeads(head_flags, keys, prev_keys, flag_op, tile_predecessor); } // Zip values and head flags #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { scan_items[ITEM].value = values[ITEM]; scan_items[ITEM].key = head_flags[ITEM]; } // Perform exclusive tile scan // Inclusive block-wide scan aggregate OffsetValuePairT block_aggregate; // Number of segments prior to this tile OffsetT num_segments_prefix; // The tile prefix folded with block_aggregate OffsetValuePairT total_aggregate; if (tile_idx == 0) { // Scan first tile BlockScanT(temp_storage.scan_storage.scan) .ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); num_segments_prefix = 0; total_aggregate = block_aggregate; // Update tile status if there are successor tiles if ((!IS_LAST_TILE) && (threadIdx.x == 0)) { tile_state.SetInclusive(0, block_aggregate); } } else { // Scan non-first tile TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, scan_op, tile_idx); BlockScanT(temp_storage.scan_storage.scan) .ExclusiveScan(scan_items, scan_items, scan_op, prefix_op); block_aggregate = prefix_op.GetBlockAggregate(); num_segments_prefix = prefix_op.GetExclusivePrefix().key; total_aggregate = prefix_op.GetInclusivePrefix(); } // Rezip scatter items and segment indices #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { scatter_items[ITEM].key = prev_keys[ITEM]; scatter_items[ITEM].value = scan_items[ITEM].value; segment_indices[ITEM] = scan_items[ITEM].key; } // At this point, each flagged segment head has: // - The key for the previous segment // - The reduced value from the previous segment // - The segment index for the reduced value // Scatter flagged keys and values OffsetT num_tile_segments = block_aggregate.key; Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix); // Last thread in last tile will output final count (and last pair, if // necessary) if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1)) { OffsetT num_segments = num_segments_prefix + num_tile_segments; // If the last tile is a whole tile, output the final_value if (num_remaining == TILE_ITEMS) { d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; d_aggregates_out[num_segments] = total_aggregate.value; num_segments++; } // Output the total number of items selected *d_num_runs_out = num_segments; } } /** * @brief Scan tiles of items as part of a dynamic chained scan * * @param num_items * Total number of input items * * @param tile_state * Global tile state descriptor * * @param start_tile * The starting tile for the current grid */ __device__ __forceinline__ void ConsumeRange(OffsetT num_items, ScanTileStateT &tile_state, int start_tile) { // Blocks are launched in increasing order, so just assign one tile 
per // block // Current tile index int tile_idx = start_tile + blockIdx.x; // Global offset for the current tile OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Remaining items (including this tile) OffsetT num_remaining = num_items - tile_offset; if (num_remaining > TILE_ITEMS) { // Not last tile ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); } else if (num_remaining > 0) { // Last tile ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_rle.cuh000066400000000000000000001047011434614775400166150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode. 
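 *
 * This agent backs the non-trivial-run variant of device-level run-length
 * encoding. A minimal host-side sketch via
 * cub::DeviceRunLengthEncode::NonTrivialRuns is given below; the device
 * arrays (d_in, d_offsets_out, d_lengths_out, d_num_runs_out) and num_items
 * are assumed to be allocated and populated elsewhere, and the values shown
 * are illustrative only:
 *
 * \code
 * #include <cub/cub.cuh>
 *
 * // d_in <-- [0, 2, 2, 9, 5, 5, 5, 8]
 * void  *d_temp_storage     = nullptr;
 * size_t temp_storage_bytes = 0;
 * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
 *                                            d_in, d_offsets_out, d_lengths_out,
 *                                            d_num_runs_out, num_items);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
 *                                            d_in, d_offsets_out, d_lengths_out,
 *                                            d_num_runs_out, num_items);
 * // d_offsets_out  <-- [1, 4]
 * // d_lengths_out  <-- [2, 3]
 * // d_num_runs_out <-- [2]
 * \endcode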
*/ #pragma once #include #include "single_pass_scan_operators.cuh" #include "../block/block_load.cuh" #include "../block/block_store.cuh" #include "../block/block_scan.cuh" #include "../block/block_exchange.cuh" #include "../block/block_discontinuity.cuh" #include "../config.cuh" #include "../grid/grid_queue.cuh" #include "../iterator/cache_modified_input_iterator.cuh" #include "../iterator/constant_input_iterator.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentRle */ template < int _BLOCK_THREADS, ///< Threads per thread block int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use struct AgentRlePolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) }; static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode */ template < typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type typename InputIteratorT, ///< Random-access input iterator type for data typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values typename EqualityOpT, ///< T equality operator type typename OffsetT> ///< Signed integer type for global offsets struct AgentRle { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// The input value type using T = cub::detail::value_t; /// The lengths output value type using LengthT = cub::detail::non_void_value_t; /// Tuple type for scanning (pairs run-length and run-index) using LengthOffsetPair = KeyValuePair; /// Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; // Constants enum { WARP_THREADS = CUB_WARP_THREADS(0), BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * 
ITEMS_PER_THREAD, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, /// Whether or not to sync after loading data SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, }; /** * Special operator that signals all out-of-bounds items are not equal to everything else, * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked * trivial. */ template struct OobInequalityOp { OffsetT num_remaining; EqualityOpT equality_op; __device__ __forceinline__ OobInequalityOp( OffsetT num_remaining, EqualityOpT equality_op) : num_remaining(num_remaining), equality_op(equality_op) {} template __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx) { if (!LAST_TILE || (idx < num_remaining)) return !equality_op(first, second); else return true; } }; // Cache-modified Input iterator wrapper type (for applying cache modifier) for data // Wrap the native input pointer with CacheModifiedVLengthnputIterator // Directly use the supplied input iterator type using WrappedInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, InputIteratorT>; // Parameterized BlockLoad type for data using BlockLoadT = BlockLoad; // Parameterized BlockDiscontinuity type for data using BlockDiscontinuityT = BlockDiscontinuity ; // Parameterized WarpScan type using WarpScanPairs = WarpScan; // Reduce-length-by-run scan operator using ReduceBySegmentOpT = ReduceBySegmentOp; // Callback type for obtaining tile prefix during block scan using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Warp exchange types using WarpExchangePairs = WarpExchange; using WarpExchangePairsStorage = cub::detail::conditional_t; using WarpExchangeOffsets = WarpExchange; using WarpExchangeLengths = WarpExchange; typedef LengthOffsetPair WarpAggregates[WARPS]; // Shared memory type for this thread block struct _TempStorage { // Aliasable storage layout union Aliasable { struct ScanStorage { typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback } scan_storage; // Smem needed for input loading typename BlockLoadT::TempStorage load; // Aliasable layout needed for two-phase scatter union ScatterAliasable { unsigned long long align; WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS]; typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS]; typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS]; } scatter_aliasable; } aliasable; OffsetT tile_idx; // Shared tile index LengthOffsetPair tile_inclusive; // Inclusive tile prefix LengthOffsetPair tile_exclusive; // Exclusive tile prefix }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- 
_TempStorage& temp_storage; ///< Reference to temp_storage WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets LengthsOutputIteratorT d_lengths_out; ///< Output run lengths EqualityOpT equality_op; ///< T equality operator ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- // Constructor __device__ __forceinline__ AgentRle( TempStorage &temp_storage, ///< [in] Reference to temp_storage InputIteratorT d_in, ///< [in] Pointer to input sequence of data items OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run offsets LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths EqualityOpT equality_op, ///< [in] T equality operator OffsetT num_items) ///< [in] Total number of input items : temp_storage(temp_storage.Alias()), d_in(d_in), d_offsets_out(d_offsets_out), d_lengths_out(d_lengths_out), equality_op(equality_op), scan_op(cub::Sum()), num_items(num_items) {} //--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- template __device__ __forceinline__ void InitializeSelections( OffsetT tile_offset, OffsetT num_remaining, T (&items)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) { bool head_flags[ITEMS_PER_THREAD]; bool tail_flags[ITEMS_PER_THREAD]; OobInequalityOp inequality_op(num_remaining, equality_op); if (FIRST_TILE && LAST_TILE) { // First-and-last-tile always head-flags the first item and tail-flags the last item BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity).FlagHeadsAndTails( head_flags, tail_flags, items, inequality_op); } else if (FIRST_TILE) { // First-tile always head-flags the first item // Get the first item from the next tile T tile_successor_item; if (threadIdx.x == BLOCK_THREADS - 1) tile_successor_item = d_in[tile_offset + TILE_ITEMS]; BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity).FlagHeadsAndTails( head_flags, tail_flags, tile_successor_item, items, inequality_op); } else if (LAST_TILE) { // Last-tile always flags the last item // Get the last item from the previous tile T tile_predecessor_item; if (threadIdx.x == 0) tile_predecessor_item = d_in[tile_offset - 1]; BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity).FlagHeadsAndTails( head_flags, tile_predecessor_item, tail_flags, items, inequality_op); } else { // Get the first item from the next tile T tile_successor_item; if (threadIdx.x == BLOCK_THREADS - 1) tile_successor_item = d_in[tile_offset + TILE_ITEMS]; // Get the last item from the previous tile T tile_predecessor_item; if (threadIdx.x == 0) tile_predecessor_item = d_in[tile_offset - 1]; BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity).FlagHeadsAndTails( head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op); } // Zip counts and runs #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]); lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM])); } } 
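    // Illustrative note (comment only, not executed code): assuming the items
    // adjacent to this sub-sequence differ from it on both sides, a tile
    // sub-sequence [5, 5, 5, 8] is encoded by the zip above as
    //
    //   head_flags                  [1, 0, 0, 1]
    //   tail_flags                  [0, 0, 1, 1]
    //   lengths_and_num_runs.key    [1, 0, 0, 0]   (starts of non-trivial runs)
    //   lengths_and_num_runs.value  [1, 1, 1, 0]   (contributions to run length)
    //
    // Scanning these pairs with the reduce-by-segment operator therefore
    // accumulates run lengths that reset at each non-trivial run start while
    // simultaneously counting the non-trivial runs themselves.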
//--------------------------------------------------------------------- // Scan utility methods //--------------------------------------------------------------------- /** * Scan of allocations */ __device__ __forceinline__ void WarpScanAllocations( LengthOffsetPair &tile_aggregate, LengthOffsetPair &warp_aggregate, LengthOffsetPair &warp_exclusive_in_tile, LengthOffsetPair &thread_exclusive_in_warp, LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) { // Perform warpscans unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); int lane_id = LaneId(); LengthOffsetPair identity; identity.key = 0; identity.value = 0; LengthOffsetPair thread_inclusive; LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op); WarpScanPairs(temp_storage.aliasable.scan_storage.warp_scan[warp_id]).Scan( thread_aggregate, thread_inclusive, thread_exclusive_in_warp, identity, scan_op); // Last lane in each warp shares its warp-aggregate if (lane_id == WARP_THREADS - 1) temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[warp_id] = thread_inclusive; CTA_SYNC(); // Accumulate total selected and the warp-wide prefix warp_exclusive_in_tile = identity; warp_aggregate = temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[warp_id]; tile_aggregate = temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[0]; #pragma unroll for (int WARP = 1; WARP < WARPS; ++WARP) { if (warp_id == WARP) warp_exclusive_in_tile = tile_aggregate; tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[WARP]); } } //--------------------------------------------------------------------- // Utility methods for scattering selections //--------------------------------------------------------------------- /** * Two-phase scatter, specialized for warp time-slicing */ template __device__ __forceinline__ void ScatterTwoPhase( OffsetT tile_num_runs_exclusive_in_global, OffsetT warp_num_runs_aggregate, OffsetT warp_num_runs_exclusive_in_tile, OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); int lane_id = LaneId(); // Locally compact items within the warp (first warp) if (warp_id == 0) { WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( lengths_and_offsets, thread_num_runs_exclusive_in_warp); } // Locally compact items within the warp (remaining warps) #pragma unroll for (int SLICE = 1; SLICE < WARPS; ++SLICE) { CTA_SYNC(); if (warp_id == SLICE) { WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( lengths_and_offsets, thread_num_runs_exclusive_in_warp); } } // Global scatter #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id) { OffsetT item_offset = tile_num_runs_exclusive_in_global + warp_num_runs_exclusive_in_tile + (ITEM * WARP_THREADS) + lane_id; // Scatter offset d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; // Scatter length if not the first (global) length if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) { d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; } } } } /** * Two-phase scatter */ template __device__ __forceinline__ void ScatterTwoPhase( OffsetT tile_num_runs_exclusive_in_global, OffsetT warp_num_runs_aggregate, OffsetT warp_num_runs_exclusive_in_tile, OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); int lane_id = LaneId(); // Unzip OffsetT run_offsets[ITEMS_PER_THREAD]; LengthT run_lengths[ITEMS_PER_THREAD]; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { run_offsets[ITEM] = lengths_and_offsets[ITEM].key; run_lengths[ITEM] = lengths_and_offsets[ITEM].value; } WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped( run_offsets, thread_num_runs_exclusive_in_warp); WARP_SYNC(0xffffffff); WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped( run_lengths, thread_num_runs_exclusive_in_warp); // Global scatter #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate) { OffsetT item_offset = tile_num_runs_exclusive_in_global + warp_num_runs_exclusive_in_tile + (ITEM * WARP_THREADS) + lane_id; // Scatter offset d_offsets_out[item_offset] = run_offsets[ITEM]; // Scatter length if not the first (global) length if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) { d_lengths_out[item_offset - 1] = run_lengths[ITEM]; } } } } /** * Direct scatter */ template __device__ __forceinline__ void ScatterDirect( OffsetT tile_num_runs_exclusive_in_global, OffsetT warp_num_runs_aggregate, OffsetT warp_num_runs_exclusive_in_tile, OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate) { OffsetT item_offset = tile_num_runs_exclusive_in_global + warp_num_runs_exclusive_in_tile + thread_num_runs_exclusive_in_warp[ITEM]; // Scatter offset d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; // Scatter length if not the first (global) length if (item_offset >= 1) { d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; } } } } /** * Scatter */ template __device__ 
__forceinline__ void Scatter( OffsetT tile_num_runs_aggregate, OffsetT tile_num_runs_exclusive_in_global, OffsetT warp_num_runs_aggregate, OffsetT warp_num_runs_exclusive_in_tile, OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) { if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS)) { // Direct scatter if the warp has any items if (warp_num_runs_aggregate) { ScatterDirect( tile_num_runs_exclusive_in_global, warp_num_runs_aggregate, warp_num_runs_exclusive_in_tile, thread_num_runs_exclusive_in_warp, lengths_and_offsets); } } else { // Scatter two phase ScatterTwoPhase( tile_num_runs_exclusive_in_global, warp_num_runs_aggregate, warp_num_runs_exclusive_in_tile, thread_num_runs_exclusive_in_warp, lengths_and_offsets, Int2Type()); } } //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * Process a tile of input (dynamic chained scan) */ template < bool LAST_TILE> __device__ __forceinline__ LengthOffsetPair ConsumeTile( OffsetT num_items, ///< Total number of global input items OffsetT num_remaining, ///< Number of global input items remaining (including this tile) int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT &tile_status) ///< Global list of tile status { if (tile_idx == 0) { // First tile // Load items T items[ITEMS_PER_THREAD]; if (LAST_TILE) BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); else BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); if (SYNC_AFTER_LOAD) CTA_SYNC(); // Set flags LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; InitializeSelections( tile_offset, num_remaining, items, lengths_and_num_runs); // Exclusive scan of lengths and runs LengthOffsetPair tile_aggregate; LengthOffsetPair warp_aggregate; LengthOffsetPair warp_exclusive_in_tile; LengthOffsetPair thread_exclusive_in_warp; WarpScanAllocations( tile_aggregate, warp_aggregate, warp_exclusive_in_tile, thread_exclusive_in_warp, lengths_and_num_runs); // Update tile status if this is not the last tile if (!LAST_TILE && (threadIdx.x == 0)) tile_status.SetInclusive(0, tile_aggregate); // Update thread_exclusive_in_warp to fold in warp run-length if (thread_exclusive_in_warp.key == 0) thread_exclusive_in_warp.value += warp_exclusive_in_tile.value; LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; // Downsweep scan through lengths_and_num_runs internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); // Zip #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
lengths_and_num_runs2[ITEM].key : // keep WARP_THREADS * ITEMS_PER_THREAD; // discard } OffsetT tile_num_runs_aggregate = tile_aggregate.key; OffsetT tile_num_runs_exclusive_in_global = 0; OffsetT warp_num_runs_aggregate = warp_aggregate.key; OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; // Scatter Scatter( tile_num_runs_aggregate, tile_num_runs_exclusive_in_global, warp_num_runs_aggregate, warp_num_runs_exclusive_in_tile, thread_num_runs_exclusive_in_warp, lengths_and_offsets); // Return running total (inclusive of this tile) return tile_aggregate; } else { // Not first tile // Load items T items[ITEMS_PER_THREAD]; if (LAST_TILE) BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); else BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); if (SYNC_AFTER_LOAD) CTA_SYNC(); // Set flags LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; InitializeSelections( tile_offset, num_remaining, items, lengths_and_num_runs); // Exclusive scan of lengths and runs LengthOffsetPair tile_aggregate; LengthOffsetPair warp_aggregate; LengthOffsetPair warp_exclusive_in_tile; LengthOffsetPair thread_exclusive_in_warp; WarpScanAllocations( tile_aggregate, warp_aggregate, warp_exclusive_in_tile, thread_exclusive_in_warp, lengths_and_num_runs); // First warp computes tile prefix in lane 0 TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.scan_storage.prefix, Sum(), tile_idx); unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); if (warp_id == 0) { prefix_op(tile_aggregate); if (threadIdx.x == 0) temp_storage.tile_exclusive = prefix_op.exclusive_prefix; } CTA_SYNC(); LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; // Update thread_exclusive_in_warp to fold in warp and tile run-lengths LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile); if (thread_exclusive_in_warp.key == 0) thread_exclusive_in_warp.value += thread_exclusive.value; // Downsweep scan through lengths_and_num_runs LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); // Zip #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
lengths_and_num_runs2[ITEM].key : // keep WARP_THREADS * ITEMS_PER_THREAD; // discard } OffsetT tile_num_runs_aggregate = tile_aggregate.key; OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key; OffsetT warp_num_runs_aggregate = warp_aggregate.key; OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; // Scatter Scatter( tile_num_runs_aggregate, tile_num_runs_exclusive_in_global, warp_num_runs_aggregate, warp_num_runs_exclusive_in_tile, thread_num_runs_exclusive_in_warp, lengths_and_offsets); // Return running total (inclusive of this tile) return prefix_op.inclusive_prefix; } } /** * Scan tiles of items as part of a dynamic chained scan */ template ///< Output iterator type for recording number of items selected __device__ __forceinline__ void ConsumeRange( int num_tiles, ///< Total number of input tiles ScanTileStateT& tile_status, ///< Global list of tile status NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) if (tile_idx < num_tiles - 1) { // Not the last tile (full) ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); } else if (num_remaining > 0) { // The last tile (possibly partially-full) LengthOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); if (threadIdx.x == 0) { // Output the total number of items selected *d_num_runs_out = running_total.key; // The inclusive prefix contains accumulated length reduction for the last run if (running_total.key > 0) d_lengths_out[running_total.key - 1] = running_total.value; } } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_scan.cuh000066400000000000000000000444731434614775400167700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::AgentScan implements a stateful abstraction of CUDA thread blocks * for participating in device-wide prefix scan . */ #pragma once #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * @brief Parameterizable tuning policy type for AgentScan * * @tparam NOMINAL_BLOCK_THREADS_4B * Threads per thread block * * @tparam NOMINAL_ITEMS_PER_THREAD_4B * Items per thread (per tile of input) * * @tparam ComputeT * Dominant compute type * * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * * @tparam _STORE_ALGORITHM * The BlockStore algorithm to use * * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use * */ template > struct AgentScanPolicy : ScalingType { static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentScan implements a stateful abstraction of CUDA thread blocks for * participating in device-wide prefix scan. 
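 *
 * The functionality implemented here is normally reached through the
 * device-level scan entry points. A minimal host-side sketch via
 * cub::DeviceScan::ExclusiveSum is given below; the device arrays d_in and
 * d_out and num_items are assumed to be allocated and populated elsewhere,
 * and the values shown are illustrative only:
 *
 * @code
 * #include <cub/cub.cuh>
 *
 * // d_in <-- [8, 6, 7, 5, 3, 0, 9]
 * void  *d_temp_storage     = nullptr;
 * size_t temp_storage_bytes = 0;
 * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
 *                               d_in, d_out, num_items);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
 *                               d_in, d_out, num_items);
 * // d_out <-- [0, 8, 14, 21, 26, 29, 29]
 * @endcode
 *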
* @tparam AgentScanPolicyT * Parameterized AgentScanPolicyT tuning policy type * * @tparam InputIteratorT * Random-access input iterator type * * @tparam OutputIteratorT * Random-access output iterator type * * @tparam ScanOpT * Scan functor type * * @tparam InitValueT * The init_value element for ScanOpT type (cub::NullType for inclusive scan) * * @tparam OffsetT * Signed integer type for global offsets * */ template struct AgentScan { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input value type using InputT = cub::detail::value_t; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; // Input iterator wrapper type (for applying cache modifier) // Wrap the native input pointer with CacheModifiedInputIterator // or directly use the supplied input iterator type using WrappedInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, InputIteratorT>; // Constants enum { // Inclusive scan if no init_value type is provided IS_INCLUSIVE = std::is_same::value, BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, }; // Parameterized BlockLoad type typedef BlockLoad BlockLoadT; // Parameterized BlockStore type typedef BlockStore BlockStoreT; // Parameterized BlockScan type typedef BlockScan BlockScanT; // Callback type for obtaining tile prefix during block scan typedef TilePrefixCallbackOp TilePrefixCallbackOpT; // Stateful BlockScan prefix callback type for managing a running total while // scanning consecutive tiles typedef BlockScanRunningPrefixOp RunningPrefixCallbackOp; // Shared memory type for this thread block union _TempStorage { // Smem needed for tile loading typename BlockLoadT::TempStorage load; // Smem needed for tile storing typename BlockStoreT::TempStorage store; struct ScanStorage { // Smem needed for cooperative prefix callback typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for tile scanning typename BlockScanT::TempStorage scan; } scan_storage; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage &temp_storage; ///< Reference to temp_storage WrappedInputIteratorT d_in; ///< Input data OutputIteratorT d_out; ///< Output data ScanOpT scan_op; ///< Binary scan operator InitValueT init_value; ///< The init_value element for ScanOpT //--------------------------------------------------------------------- // Block scan utility methods //--------------------------------------------------------------------- /** * Exclusive scan specialization (first tile) */ __device__ __forceinline__ void ScanTile(AccumT (&items)[ITEMS_PER_THREAD], AccumT init_value, ScanOpT scan_op, AccumT &block_aggregate, Int2Type /*is_inclusive*/) { BlockScanT(temp_storage.scan_storage.scan) .ExclusiveScan(items, items, init_value, scan_op, block_aggregate); block_aggregate = scan_op(init_value, block_aggregate); } /** * Inclusive scan specialization (first tile) */ __device__ __forceinline__ void ScanTile(AccumT (&items)[ITEMS_PER_THREAD], InitValueT /*init_value*/, ScanOpT scan_op, AccumT &block_aggregate, Int2Type /*is_inclusive*/) { BlockScanT(temp_storage.scan_storage.scan) 
.InclusiveScan(items, items, scan_op, block_aggregate); } /** * Exclusive scan specialization (subsequent tiles) */ template __device__ __forceinline__ void ScanTile(AccumT (&items)[ITEMS_PER_THREAD], ScanOpT scan_op, PrefixCallback &prefix_op, Int2Type /*is_inclusive*/) { BlockScanT(temp_storage.scan_storage.scan) .ExclusiveScan(items, items, scan_op, prefix_op); } /** * Inclusive scan specialization (subsequent tiles) */ template __device__ __forceinline__ void ScanTile(AccumT (&items)[ITEMS_PER_THREAD], ScanOpT scan_op, PrefixCallback &prefix_op, Int2Type /*is_inclusive*/) { BlockScanT(temp_storage.scan_storage.scan) .InclusiveScan(items, items, scan_op, prefix_op); } //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- /** * @param temp_storage * Reference to temp_storage * * @param d_in * Input data * * @param d_out * Output data * * @param scan_op * Binary scan operator * * @param init_value * Initial value to seed the exclusive scan */ __device__ __forceinline__ AgentScan(TempStorage &temp_storage, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_out(d_out) , scan_op(scan_op) , init_value(init_value) {} //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * Process a tile of input (dynamic chained scan) * @tparam IS_LAST_TILE * Whether the current tile is the last tile * * @param num_remaining * Number of global input items remaining (including this tile) * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor */ template __device__ __forceinline__ void ConsumeTile(OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state) { // Load items AccumT items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last element with the first element because collectives are // not suffix guarded. 
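      // (The padded copies of *(d_in + tile_offset) take part in the
      // block-wide scan below, but they are never written out: the guarded
      // BlockStore later in this function only stores num_remaining items.)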
BlockLoadT(temp_storage.load) .Load(d_in + tile_offset, items, num_remaining, *(d_in + tile_offset)); } else { BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } CTA_SYNC(); // Perform tile scan if (tile_idx == 0) { // Scan first tile AccumT block_aggregate; ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); if ((!IS_LAST_TILE) && (threadIdx.x == 0)) { tile_state.SetInclusive(0, block_aggregate); } } else { // Scan non-first tile TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, scan_op, tile_idx); ScanTile(items, scan_op, prefix_op, Int2Type()); } CTA_SYNC(); // Store items if (IS_LAST_TILE) { BlockStoreT(temp_storage.store) .Store(d_out + tile_offset, items, num_remaining); } else { BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); } } /** * @brief Scan tiles of items as part of a dynamic chained scan * * @param num_items * Total number of input items * * @param tile_state * Global tile state descriptor * * @param start_tile * The starting tile for the current grid */ __device__ __forceinline__ void ConsumeRange(OffsetT num_items, ScanTileStateT &tile_state, int start_tile) { // Blocks are launched in increasing order, so just assign one tile per // block // Current tile index int tile_idx = start_tile + blockIdx.x; // Global offset for the current tile OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Remaining items (including this tile) OffsetT num_remaining = num_items - tile_offset; if (num_remaining > TILE_ITEMS) { // Not last tile ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); } else if (num_remaining > 0) { // Last tile ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); } } //--------------------------------------------------------------------------- // Scan an sequence of consecutive tiles (independent of other thread blocks) //--------------------------------------------------------------------------- /** * @brief Process a tile of input * * @param tile_offset * Tile offset * * @param prefix_op * Running prefix operator * * @param valid_items * Number of valid items in the tile */ template __device__ __forceinline__ void ConsumeTile(OffsetT tile_offset, RunningPrefixCallbackOp &prefix_op, int valid_items = TILE_ITEMS) { // Load items AccumT items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last element with the first element because collectives are // not suffix guarded. 
BlockLoadT(temp_storage.load) .Load(d_in + tile_offset, items, valid_items, *(d_in + tile_offset)); } else { BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } CTA_SYNC(); // Block scan if (IS_FIRST_TILE) { AccumT block_aggregate; ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); prefix_op.running_total = block_aggregate; } else { ScanTile(items, scan_op, prefix_op, Int2Type()); } CTA_SYNC(); // Store items if (IS_LAST_TILE) { BlockStoreT(temp_storage.store) .Store(d_out + tile_offset, items, valid_items); } else { BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); } } /** * @brief Scan a consecutive share of input tiles * * @param[in] range_offset * Threadblock begin offset (inclusive) * * @param[in] range_end * Threadblock end offset (exclusive) */ __device__ __forceinline__ void ConsumeRange(OffsetT range_offset, OffsetT range_end) { BlockScanRunningPrefixOp prefix_op(scan_op); if (range_offset + TILE_ITEMS <= range_end) { // Consume first tile of input (full) ConsumeTile(range_offset, prefix_op); range_offset += TILE_ITEMS; // Consume subsequent full tiles of input while (range_offset + TILE_ITEMS <= range_end) { ConsumeTile(range_offset, prefix_op); range_offset += TILE_ITEMS; } // Consume a partially-full tile if (range_offset < range_end) { int valid_items = range_end - range_offset; ConsumeTile(range_offset, prefix_op, valid_items); } } else { // Consume the first tile of input (partially-full) int valid_items = range_end - range_offset; ConsumeTile(range_offset, prefix_op, valid_items); } } /** * @brief Scan a consecutive share of input tiles, seeded with the * specified prefix value * @param[in] range_offset * Threadblock begin offset (inclusive) * * @param[in] range_end * Threadblock end offset (exclusive) * * @param[in] prefix * The prefix to apply to the scan segment */ __device__ __forceinline__ void ConsumeRange(OffsetT range_offset, OffsetT range_end, AccumT prefix) { BlockScanRunningPrefixOp prefix_op(prefix, scan_op); // Consume full tiles of input while (range_offset + TILE_ITEMS <= range_end) { ConsumeTile(range_offset, prefix_op); range_offset += TILE_ITEMS; } // Consume a partially-full tile if (range_offset < range_end) { int valid_items = range_end - range_offset; ConsumeTile(range_offset, prefix_op, valid_items); } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_scan_by_key.cuh000066400000000000000000000432661434614775400203310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file AgentScanByKey implements a stateful abstraction of CUDA thread blocks * for participating in device-wide prefix scan by key. */ #pragma once #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentScanByKey */ template struct AgentScanByKeyPolicy { static constexpr int BLOCK_THREADS = _BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; static constexpr BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentScanByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide prefix scan by key. 
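 *
 * This agent is not used directly; it is the per-CTA work-horse underneath the
 * device-level cub::DeviceScan::*ByKey entry points (InclusiveSumByKey,
 * ExclusiveScanByKey, etc.).  A minimal host-side sketch of reaching it through
 * that public interface (the d_keys_in, d_values_in, d_values_out device
 * buffers and num_items are assumed to be set up by the caller):
 *
 * @code
 * #include <cub/cub.cuh>
 *
 * // Query temporary storage requirements, allocate, then run the scan
 * void   *d_temp_storage     = nullptr;
 * size_t  temp_storage_bytes = 0;
 * cub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_bytes,
 *                                    d_keys_in, d_values_in, d_values_out,
 *                                    num_items);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_bytes,
 *                                    d_keys_in, d_values_in, d_values_out,
 *                                    num_items);
 * @endcode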
* * @tparam AgentScanByKeyPolicyT * Parameterized AgentScanPolicyT tuning policy type * * @tparam KeysInputIteratorT * Random-access input iterator type * * @tparam ValuesInputIteratorT * Random-access input iterator type * * @tparam ValuesOutputIteratorT * Random-access output iterator type * * @tparam EqualityOp * Equality functor type * * @tparam ScanOpT * Scan functor type * * @tparam InitValueT * The init_value element for ScanOpT type (cub::NullType for inclusive scan) * * @tparam OffsetT * Signed integer type for global offsets * */ template struct AgentScanByKey { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- using KeyT = cub::detail::value_t; using InputT = cub::detail::value_t; using SizeValuePairT = KeyValuePair; using KeyValuePairT = KeyValuePair; using ReduceBySegmentOpT = ReduceBySegmentOp; using ScanTileStateT = ReduceByKeyScanTileState; // Constants // Inclusive scan if no init_value type is provided static constexpr int IS_INCLUSIVE = std::is_same::value; static constexpr int BLOCK_THREADS = AgentScanByKeyPolicyT::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = AgentScanByKeyPolicyT::ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; using WrappedKeysInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, KeysInputIteratorT>; using WrappedValuesInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, ValuesInputIteratorT>; using BlockLoadKeysT = BlockLoad; using BlockLoadValuesT = BlockLoad; using BlockStoreValuesT = BlockStore; using BlockDiscontinuityKeysT = BlockDiscontinuity; using TilePrefixCallbackT = TilePrefixCallbackOp; using BlockScanT = BlockScan; union TempStorage_ { struct ScanStorage { typename BlockScanT::TempStorage scan; typename TilePrefixCallbackT::TempStorage prefix; typename BlockDiscontinuityKeysT::TempStorage discontinuity; } scan_storage; typename BlockLoadKeysT::TempStorage load_keys; typename BlockLoadValuesT::TempStorage load_values; typename BlockStoreValuesT::TempStorage store_values; }; struct TempStorage : cub::Uninitialized {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- TempStorage_ &storage; WrappedKeysInputIteratorT d_keys_in; KeyT *d_keys_prev_in; WrappedValuesInputIteratorT d_values_in; ValuesOutputIteratorT d_values_out; InequalityWrapper inequality_op; ScanOpT scan_op; ReduceBySegmentOpT pair_scan_op; InitValueT init_value; //--------------------------------------------------------------------- // Block scan utility methods (first tile) //--------------------------------------------------------------------- // Exclusive scan specialization __device__ __forceinline__ void ScanTile(SizeValuePairT (&scan_items)[ITEMS_PER_THREAD], SizeValuePairT &tile_aggregate, Int2Type /* is_inclusive */) { BlockScanT(storage.scan_storage.scan) .ExclusiveScan(scan_items, scan_items, pair_scan_op, tile_aggregate); } // Inclusive scan specialization __device__ __forceinline__ void ScanTile(SizeValuePairT (&scan_items)[ITEMS_PER_THREAD], SizeValuePairT &tile_aggregate, Int2Type /* is_inclusive */) { BlockScanT(storage.scan_storage.scan) .InclusiveScan(scan_items, scan_items, pair_scan_op, tile_aggregate); } //--------------------------------------------------------------------- // Block scan 
utility methods (subsequent tiles) //--------------------------------------------------------------------- // Exclusive scan specialization (with prefix from predecessors) __device__ __forceinline__ void ScanTile(SizeValuePairT (&scan_items)[ITEMS_PER_THREAD], SizeValuePairT &tile_aggregate, TilePrefixCallbackT &prefix_op, Int2Type /* is_incclusive */) { BlockScanT(storage.scan_storage.scan) .ExclusiveScan(scan_items, scan_items, pair_scan_op, prefix_op); tile_aggregate = prefix_op.GetBlockAggregate(); } // Inclusive scan specialization (with prefix from predecessors) __device__ __forceinline__ void ScanTile(SizeValuePairT (&scan_items)[ITEMS_PER_THREAD], SizeValuePairT &tile_aggregate, TilePrefixCallbackT &prefix_op, Int2Type /* is_inclusive */) { BlockScanT(storage.scan_storage.scan) .InclusiveScan(scan_items, scan_items, pair_scan_op, prefix_op); tile_aggregate = prefix_op.GetBlockAggregate(); } //--------------------------------------------------------------------- // Zip utility methods //--------------------------------------------------------------------- template __device__ __forceinline__ void ZipValuesAndFlags(OffsetT num_remaining, AccumT (&values)[ITEMS_PER_THREAD], OffsetT (&segment_flags)[ITEMS_PER_THREAD], SizeValuePairT (&scan_items)[ITEMS_PER_THREAD]) { // Zip values and segment_flags #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Set segment_flags for first out-of-bounds item, zero for others if (IS_LAST_TILE && OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining) { segment_flags[ITEM] = 1; } scan_items[ITEM].value = values[ITEM]; scan_items[ITEM].key = segment_flags[ITEM]; } } __device__ __forceinline__ void UnzipValues(AccumT (&values)[ITEMS_PER_THREAD], SizeValuePairT (&scan_items)[ITEMS_PER_THREAD]) { // Zip values and segment_flags #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { values[ITEM] = scan_items[ITEM].value; } } template ::value, typename std::enable_if::type = 0> __device__ __forceinline__ void AddInitToScan(AccumT (&items)[ITEMS_PER_THREAD], OffsetT (&flags)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { items[ITEM] = flags[ITEM] ? 
init_value : scan_op(init_value, items[ITEM]); } } template ::value, typename std::enable_if::type = 0> __device__ __forceinline__ void AddInitToScan(AccumT (&/*items*/)[ITEMS_PER_THREAD], OffsetT (&/*flags*/)[ITEMS_PER_THREAD]) {} //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- // Process a tile of input (dynamic chained scan) // template __device__ __forceinline__ void ConsumeTile(OffsetT /*num_items*/, OffsetT num_remaining, int tile_idx, OffsetT tile_base, ScanTileStateT &tile_state) { // Load items KeyT keys[ITEMS_PER_THREAD]; AccumT values[ITEMS_PER_THREAD]; OffsetT segment_flags[ITEMS_PER_THREAD]; SizeValuePairT scan_items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last element with the first element // because collectives are not suffix guarded BlockLoadKeysT(storage.load_keys) .Load(d_keys_in + tile_base, keys, num_remaining, *(d_keys_in + tile_base)); } else { BlockLoadKeysT(storage.load_keys).Load(d_keys_in + tile_base, keys); } CTA_SYNC(); if (IS_LAST_TILE) { // Fill last element with the first element // because collectives are not suffix guarded BlockLoadValuesT(storage.load_values) .Load(d_values_in + tile_base, values, num_remaining, *(d_values_in + tile_base)); } else { BlockLoadValuesT(storage.load_values) .Load(d_values_in + tile_base, values); } CTA_SYNC(); // first tile if (tile_idx == 0) { BlockDiscontinuityKeysT(storage.scan_storage.discontinuity) .FlagHeads(segment_flags, keys, inequality_op); // Zip values and segment_flags ZipValuesAndFlags(num_remaining, values, segment_flags, scan_items); // Exclusive scan of values and segment_flags SizeValuePairT tile_aggregate; ScanTile(scan_items, tile_aggregate, Int2Type()); if (threadIdx.x == 0) { if (!IS_LAST_TILE) { tile_state.SetInclusive(0, tile_aggregate); } scan_items[0].key = 0; } } else { KeyT tile_pred_key = (threadIdx.x == 0) ? 
d_keys_prev_in[tile_idx] : KeyT(); BlockDiscontinuityKeysT(storage.scan_storage.discontinuity) .FlagHeads(segment_flags, keys, inequality_op, tile_pred_key); // Zip values and segment_flags ZipValuesAndFlags(num_remaining, values, segment_flags, scan_items); SizeValuePairT tile_aggregate; TilePrefixCallbackT prefix_op(tile_state, storage.scan_storage.prefix, pair_scan_op, tile_idx); ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type()); } CTA_SYNC(); UnzipValues(values, scan_items); AddInitToScan(values, segment_flags); // Store items if (IS_LAST_TILE) { BlockStoreValuesT(storage.store_values) .Store(d_values_out + tile_base, values, num_remaining); } else { BlockStoreValuesT(storage.store_values) .Store(d_values_out + tile_base, values); } } //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- // Dequeue and scan tiles of items as part of a dynamic chained scan // with Init functor __device__ __forceinline__ AgentScanByKey(TempStorage &storage, KeysInputIteratorT d_keys_in, KeyT *d_keys_prev_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value) : storage(storage.Alias()) , d_keys_in(d_keys_in) , d_keys_prev_in(d_keys_prev_in) , d_values_in(d_values_in) , d_values_out(d_values_out) , inequality_op(equality_op) , scan_op(scan_op) , pair_scan_op(scan_op) , init_value(init_value) {} /** * Scan tiles of items as part of a dynamic chained scan * * @param num_items * Total number of input items * * @param tile_state * Global tile state descriptor * * start_tile * The starting tile for the current grid */ __device__ __forceinline__ void ConsumeRange(OffsetT num_items, ScanTileStateT &tile_state, int start_tile) { int tile_idx = blockIdx.x; OffsetT tile_base = OffsetT(ITEMS_PER_TILE) * tile_idx; OffsetT num_remaining = num_items - tile_base; if (num_remaining > ITEMS_PER_TILE) { // Not the last tile (full) ConsumeTile(num_items, num_remaining, tile_idx, tile_base, tile_state); } else if (num_remaining > 0) { // The last tile (possibly partially-full) ConsumeTile(num_items, num_remaining, tile_idx, tile_base, tile_state); } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_segment_fixup.cuh000066400000000000000000000403141434614775400207070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. */ #pragma once #include #include "single_pass_scan_operators.cuh" #include "../block/block_load.cuh" #include "../block/block_store.cuh" #include "../block/block_scan.cuh" #include "../block/block_discontinuity.cuh" #include "../config.cuh" #include "../iterator/cache_modified_input_iterator.cuh" #include "../iterator/constant_input_iterator.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentSegmentFixup */ template < int _BLOCK_THREADS, ///< Threads per thread block int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use struct AgentSegmentFixupPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) }; static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key */ template < typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type typename PairsInputIteratorT, ///< Random-access input iterator type for keys typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values typename EqualityOpT, ///< KeyT equality operator type typename ReductionOpT, ///< ValueT reduction operator type typename OffsetT> ///< Signed integer type for global offsets struct AgentSegmentFixup { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // Data type of key-value input iterator using KeyValuePairT = cub::detail::value_t; // Value type using ValueT = typename KeyValuePairT::Value; // Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; // Constants enum { BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = 
AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, // Whether or not do fixup using RLE + global atomics USE_ATOMIC_FIXUP = (std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value), // Whether or not the scan operation has a zero-valued identity value // (true if we're performing addition on a primitive type) HAS_IDENTITY_ZERO = (std::is_same::value) && (Traits::PRIMITIVE), }; // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys // Wrap the native input pointer with CacheModifiedValuesInputIterator // or directly use the supplied input iterator type using WrappedPairsInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, PairsInputIteratorT>; // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values // Wrap the native input pointer with CacheModifiedValuesInputIterator // or directly use the supplied input iterator type using WrappedFixupInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, AggregatesOutputIteratorT>; // Reduce-value-by-segment scan operator using ReduceBySegmentOpT = ReduceByKeyOp; // Parameterized BlockLoad type for pairs using BlockLoadPairs = BlockLoad; // Parameterized BlockScan type using BlockScanT = BlockScan; // Callback type for obtaining tile prefix during block scan using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { typename BlockScanT::TempStorage scan; // Smem needed for tile scanning typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback } scan_storage; // Smem needed for loading keys typename BlockLoadPairs::TempStorage load_pairs; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; ///< Reference to temp_storage WrappedPairsInputIteratorT d_pairs_in; ///< Input keys AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values InequalityWrapper inequality_op; ///< KeyT inequality operator ReductionOpT reduction_op; ///< Reduction operator ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- // Constructor __device__ __forceinline__ AgentSegmentFixup( TempStorage& temp_storage, ///< Reference to temp_storage PairsInputIteratorT d_pairs_in, ///< Input keys AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates EqualityOpT equality_op, ///< KeyT equality operator ReductionOpT reduction_op) ///< ValueT reduction operator : temp_storage(temp_storage.Alias()), d_pairs_in(d_pairs_in), d_aggregates_out(d_aggregates_out), d_fixup_in(d_aggregates_out), inequality_op(equality_op), reduction_op(reduction_op), scan_op(reduction_op) {} //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * Process input tile. 
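 * The atomic specialization below performs a per-thread run-length reduction
 * over its key-value pairs and commits each completed run with a single
 * atomicAdd.  Roughly (illustrative sketch of the loop that follows, using the
 * names defined in that member function, not an additional API):
 *
 * \code
 * for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
 * {
 *   if (pairs[ITEM].key != pairs[ITEM - 1].key)
 *     atomicAdd(d_aggregates_out + pairs[ITEM - 1].key, pairs[ITEM - 1].value);
 *   else
 *     pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
 * }
 * \endcode
 *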
Specialized for atomic-fixup */ template __device__ __forceinline__ void ConsumeTile( OffsetT num_remaining, ///< Number of global input items remaining (including this tile) int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state, ///< Global tile state descriptor Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) { KeyValuePairT pairs[ITEMS_PER_THREAD]; // Load pairs KeyValuePairT oob_pair; oob_pair.key = -1; if (IS_LAST_TILE) BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); else BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); // RLE #pragma unroll for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) { ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key; if (pairs[ITEM].key != pairs[ITEM - 1].key) atomicAdd(d_scatter, pairs[ITEM - 1].value); else pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value); } // Flush last item if valid ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key; if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0)) atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); } /** * Process input tile. Specialized for reduce-by-key fixup */ template __device__ __forceinline__ void ConsumeTile( OffsetT num_remaining, ///< Number of global input items remaining (including this tile) int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state, ///< Global tile state descriptor Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) { KeyValuePairT pairs[ITEMS_PER_THREAD]; KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; // Load pairs KeyValuePairT oob_pair; oob_pair.key = -1; if (IS_LAST_TILE) BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); else BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); CTA_SYNC(); KeyValuePairT tile_aggregate; if (tile_idx == 0) { // Exclusive scan of values and segment_flags BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate); // Update tile status if this is not the last tile if (threadIdx.x == 0) { // Set first segment id to not trigger a flush (invalid from exclusive scan) scatter_pairs[0].key = pairs[0].key; if (!IS_LAST_TILE) tile_state.SetInclusive(0, tile_aggregate); } } else { // Exclusive scan of values and segment_flags TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, scan_op, tile_idx); BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op); tile_aggregate = prefix_op.GetBlockAggregate(); } // Scatter updated values #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (scatter_pairs[ITEM].key != pairs[ITEM].key) { // Update the value at the key location ValueT value = d_fixup_in[scatter_pairs[ITEM].key]; value = reduction_op(value, scatter_pairs[ITEM].value); d_aggregates_out[scatter_pairs[ITEM].key] = value; } } // Finalize the last item if (IS_LAST_TILE) { // Last thread will output final count and last item, if necessary if (threadIdx.x == BLOCK_THREADS - 1) { // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment if (num_remaining == TILE_ITEMS) { // Update the value at the key location OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key; d_aggregates_out[last_key] = 
reduction_op(tile_aggregate.value, d_fixup_in[last_key]); } } } } /** * Scan tiles of items as part of a dynamic chained scan */ __device__ __forceinline__ void ConsumeRange( OffsetT num_items, ///< Total number of input items int num_tiles, ///< Total number of input tiles ScanTileStateT& tile_state) ///< Global tile state descriptor { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) if (num_remaining > TILE_ITEMS) { // Not the last tile (full) ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); } else if (num_remaining > 0) { // The last tile (possibly partially-full) ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_segmented_radix_sort.cuh000066400000000000000000000234471434614775400222530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * This agent will be implementing the `DeviceSegmentedRadixSort` when the * https://github.com/NVIDIA/cub/issues/383 is addressed. 
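 *
 * A thread block uses ProcessSinglePass to sort a segment that fits within a
 * single tile entirely in registers/shared memory, and ProcessIterative to
 * sweep larger segments one digit-place at a time (upsweep, digit scan,
 * downsweep).  A sketch of the per-segment driving loop a caller would use for
 * the iterative path (assumed caller-side code with double-buffered key/value
 * storage, shown only for illustration):
 *
 * @code
 * for (int bit = begin_bit; bit < end_bit; bit += RADIX_BITS)
 * {
 *   int pass_bits = CUB_MIN(RADIX_BITS, end_bit - bit);
 *   agent.ProcessIterative(bit, pass_bits,
 *                          d_keys_in,  d_values_in,
 *                          d_keys_out, d_values_out);
 *   // swap the in/out key and value buffers before the next pass
 * }
 * @endcode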
* * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam SegmentedPolicyT * Chained tuning policy * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam OffsetT * Signed integer type for global offsets */ template struct AgentSegmentedRadixSort { OffsetT num_items; static constexpr int ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD; static constexpr int BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS; static constexpr int RADIX_BITS = SegmentedPolicyT::RADIX_BITS; static constexpr int RADIX_DIGITS = 1 << RADIX_BITS; static constexpr int KEYS_ONLY = std::is_same::value; // Huge segment handlers using BlockUpsweepT = AgentRadixSortUpsweep; using DigitScanT = BlockScan; using BlockDownsweepT = AgentRadixSortDownsweep; /// Number of bin-starting offsets tracked per thread static constexpr int BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD; // Small segment handlers using BlockRadixSortT = BlockRadixSort; using BlockKeyLoadT = BlockLoad; using BlockValueLoadT = BlockLoad; union _TempStorage { // Huge segment handlers typename BlockUpsweepT::TempStorage upsweep; typename BlockDownsweepT::TempStorage downsweep; struct UnboundBlockSort { OffsetT reverse_counts_in[RADIX_DIGITS]; OffsetT reverse_counts_out[RADIX_DIGITS]; typename DigitScanT::TempStorage scan; } unbound_sort; // Small segment handlers typename BlockKeyLoadT::TempStorage keys_load; typename BlockValueLoadT::TempStorage values_load; typename BlockRadixSortT::TempStorage sort; }; using TempStorage = Uninitialized<_TempStorage>; _TempStorage &temp_storage; __device__ __forceinline__ AgentSegmentedRadixSort(OffsetT num_items, TempStorage &temp_storage) : num_items(num_items) , temp_storage(temp_storage.Alias()) {} __device__ __forceinline__ void ProcessSinglePass(int begin_bit, int end_bit, const KeyT *d_keys_in, const ValueT *d_values_in, KeyT *d_keys_out, ValueT *d_values_out) { KeyT thread_keys[ITEMS_PER_THREAD]; ValueT thread_values[ITEMS_PER_THREAD]; // For FP64 the difference is: // Lowest() -> -1.79769e+308 = 00...00b -> TwiddleIn -> -0 = 10...00b // LOWEST -> -nan = 11...11b -> TwiddleIn -> 0 = 00...00b using UnsignedBitsT = typename Traits::UnsignedBits; UnsignedBitsT default_key_bits = IS_DESCENDING ? 
Traits::LOWEST_KEY : Traits::MAX_KEY; KeyT oob_default = reinterpret_cast(default_key_bits); if (!KEYS_ONLY) { BlockValueLoadT(temp_storage.values_load) .Load(d_values_in, thread_values, num_items); CTA_SYNC(); } { BlockKeyLoadT(temp_storage.keys_load) .Load(d_keys_in, thread_keys, num_items, oob_default); CTA_SYNC(); } BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( thread_keys, thread_values, begin_bit, end_bit, Int2Type(), Int2Type()); cub::StoreDirectStriped( threadIdx.x, d_keys_out, thread_keys, num_items); if (!KEYS_ONLY) { cub::StoreDirectStriped( threadIdx.x, d_values_out, thread_values, num_items); } } __device__ __forceinline__ void ProcessIterative(int current_bit, int pass_bits, const KeyT *d_keys_in, const ValueT *d_values_in, KeyT *d_keys_out, ValueT *d_values_out) { // Upsweep BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); upsweep.ProcessRegion(OffsetT{}, num_items); CTA_SYNC(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); CTA_SYNC(); if (IS_DESCENDING) { // Reverse bin counts #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { temp_storage.unbound_sort.reverse_counts_in[bin_idx] = bin_count[track]; } } CTA_SYNC(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_count[track] = temp_storage.unbound_sort.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; } } } // Scan // The global scatter base offset for each digit value in this pass // (valid in the first RADIX_DIGITS threads) OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; DigitScanT(temp_storage.unbound_sort.scan).ExclusiveSum(bin_count, bin_offset); if (IS_DESCENDING) { // Reverse bin offsets #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { temp_storage.unbound_sort.reverse_counts_out[threadIdx.x] = bin_offset[track]; } } CTA_SYNC(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_offset[track] = temp_storage.unbound_sort.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; } } } CTA_SYNC(); // Downsweep BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); downsweep.ProcessRegion(OffsetT{}, num_items); } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_select_if.cuh000066400000000000000000000703751434614775400200010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. */ #pragma once #include #include "single_pass_scan_operators.cuh" #include "../block/block_load.cuh" #include "../block/block_store.cuh" #include "../block/block_scan.cuh" #include "../block/block_exchange.cuh" #include "../block/block_discontinuity.cuh" #include "../config.cuh" #include "../grid/grid_queue.cuh" #include "../iterator/cache_modified_input_iterator.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentSelectIf */ template < int _BLOCK_THREADS, ///< Threads per thread block int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use struct AgentSelectIfPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) }; static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection * * Performs functor-based selection if SelectOpT functor type != NullType * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) */ template < typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type typename InputIteratorT, ///< 
Random-access input iterator type for selection items typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) typename SelectedOutputIteratorT, ///< Random-access output iterator type for selection_flags items typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) typename OffsetT, ///< Signed integer type for global offsets bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output struct AgentSelectIf { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input value type using InputT = cub::detail::value_t; // The flag value type using FlagT = cub::detail::value_t; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; // Constants enum { USE_SELECT_OP, USE_SELECT_FLAGS, USE_DISCONTINUITY, BLOCK_THREADS = AgentSelectIfPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentSelectIfPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), SELECT_METHOD = (!std::is_same::value) ? USE_SELECT_OP : (!std::is_same::value) ? USE_SELECT_FLAGS : USE_DISCONTINUITY }; // Cache-modified Input iterator wrapper type (for applying cache modifier) for items // Wrap the native input pointer with CacheModifiedValuesInputIterator // or directly use the supplied input iterator type using WrappedInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, InputIteratorT>; // Cache-modified Input iterator wrapper type (for applying cache modifier) for values // Wrap the native input pointer with CacheModifiedValuesInputIterator // or directly use the supplied input iterator type using WrappedFlagsInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, FlagsInputIteratorT>; // Parameterized BlockLoad type for input data using BlockLoadT = BlockLoad; // Parameterized BlockLoad type for flags using BlockLoadFlags = BlockLoad; // Parameterized BlockDiscontinuity type for items using BlockDiscontinuityT = BlockDiscontinuity; // Parameterized BlockScan type using BlockScanT = BlockScan; // Callback type for obtaining tile prefix during block scan using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Item exchange type typedef InputT ItemExchangeT[TILE_ITEMS]; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { typename BlockScanT::TempStorage scan; // Smem needed for tile scanning typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection } scan_storage; // Smem needed for loading items typename BlockLoadT::TempStorage load_items; // Smem needed for loading values typename BlockLoadFlags::TempStorage load_flags; // Smem needed for compacting items (allows non POD items in this union) Uninitialized raw_exchange; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields 
//--------------------------------------------------------------------- _TempStorage& temp_storage; ///< Reference to temp_storage WrappedInputIteratorT d_in; ///< Input items SelectedOutputIteratorT d_selected_out; ///< Unique output items WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) InequalityWrapper inequality_op; ///< T inequality operator SelectOpT select_op; ///< Selection operator OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- // Constructor __device__ __forceinline__ AgentSelectIf( TempStorage &temp_storage, ///< Reference to temp_storage InputIteratorT d_in, ///< Input data FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) SelectedOutputIteratorT d_selected_out, ///< Output data SelectOpT select_op, ///< Selection operator EqualityOpT equality_op, ///< Equality operator OffsetT num_items) ///< Total number of input items : temp_storage(temp_storage.Alias()), d_in(d_in), d_flags_in(d_flags_in), d_selected_out(d_selected_out), select_op(select_op), inequality_op(equality_op), num_items(num_items) {} //--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- /** * Initialize selections (specialized for selection operator) */ template __device__ __forceinline__ void InitializeSelections( OffsetT /*tile_offset*/, OffsetT num_tile_items, InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Out-of-bounds items are selection_flags selection_flags[ITEM] = 1; if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) selection_flags[ITEM] = select_op(items[ITEM]); } } /** * Initialize selections (specialized for valid flags) */ template __device__ __forceinline__ void InitializeSelections( OffsetT tile_offset, OffsetT num_tile_items, InputT (&/*items*/)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { CTA_SYNC(); FlagT flags[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Out-of-bounds items are selection_flags BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); } else { BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); } // Convert flag type to selection_flags type #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { selection_flags[ITEM] = static_cast(flags[ITEM]); } } /** * Initialize selections (specialized for discontinuity detection) */ template __device__ __forceinline__ void InitializeSelections( OffsetT tile_offset, OffsetT num_tile_items, InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { if (IS_FIRST_TILE) { CTA_SYNC(); // Set head selection_flags. 
First tile sets the first flag for the first item BlockDiscontinuityT(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); } else { InputT tile_predecessor; if (threadIdx.x == 0) tile_predecessor = d_in[tile_offset - 1]; CTA_SYNC(); BlockDiscontinuityT(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor); } // Set selection flags for out-of-bounds items #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Set selection_flags for out-of-bounds items if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) selection_flags[ITEM] = 1; } } //--------------------------------------------------------------------- // Scatter utility methods //--------------------------------------------------------------------- /** * Scatter flagged items to output offsets (specialized for direct scattering) */ template __device__ __forceinline__ void ScatterDirect( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], OffsetT num_selections) { // Scatter flagged items #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (selection_flags[ITEM]) { if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) { d_selected_out[selection_indices[ITEM]] = items[ITEM]; } } } } /** * Scatter flagged items to output offsets (specialized for two-phase scattering) */ template __device__ __forceinline__ void ScatterTwoPhase( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], int /*num_tile_items*/, ///< Number of valid items in this tile int num_tile_selections, ///< Number of selections in this tile OffsetT num_selections_prefix, ///< Total number of selections prior to this tile OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition { CTA_SYNC(); // Compact and scatter items #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; if (selection_flags[ITEM]) { temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; } } CTA_SYNC(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; } } /** * Scatter flagged items to output offsets (specialized for two-phase scattering) */ template __device__ __forceinline__ void ScatterTwoPhase( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], int num_tile_items, ///< Number of valid items in this tile int num_tile_selections, ///< Number of selections in this tile OffsetT num_selections_prefix, ///< Total number of selections prior to this tile OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition { CTA_SYNC(); int tile_num_rejections = num_tile_items - num_tile_selections; // Scatter items to shared memory (rejections first) #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; int local_selection_idx = selection_indices[ITEM] - 
num_selections_prefix; int local_rejection_idx = item_idx - local_selection_idx; int local_scatter_offset = (selection_flags[ITEM]) ? tile_num_rejections + local_selection_idx : local_rejection_idx; temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; } CTA_SYNC(); // Gather items from shared memory and scatter to global #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; int rejection_idx = item_idx; int selection_idx = item_idx - tile_num_rejections; OffsetT scatter_offset = (item_idx < tile_num_rejections) ? num_items - num_rejected_prefix - rejection_idx - 1 : num_selections_prefix + selection_idx; InputT item = temp_storage.raw_exchange.Alias()[item_idx]; if (!IS_LAST_TILE || (item_idx < num_tile_items)) { d_selected_out[scatter_offset] = item; } } } /** * Scatter flagged items */ template __device__ __forceinline__ void Scatter( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], int num_tile_items, ///< Number of valid items in this tile int num_tile_selections, ///< Number of selections in this tile OffsetT num_selections_prefix, ///< Total number of selections prior to this tile OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile OffsetT num_selections) ///< Total number of selections including this tile { // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) { ScatterTwoPhase( items, selection_flags, selection_indices, num_tile_items, num_tile_selections, num_selections_prefix, num_rejected_prefix, Int2Type()); } else { ScatterDirect( items, selection_flags, selection_indices, num_selections); } } //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * Process first tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) */ template __device__ __forceinline__ OffsetT ConsumeFirstTile( int num_tile_items, ///< Number of input items comprising this tile OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor { InputT items[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; OffsetT selection_indices[ITEMS_PER_THREAD]; // Load items if (IS_LAST_TILE) BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); else BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); // Initialize selection_flags InitializeSelections( tile_offset, num_tile_items, items, selection_flags, Int2Type()); CTA_SYNC(); // Exclusive scan of selection_flags OffsetT num_tile_selections; BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections); if (threadIdx.x == 0) { // Update tile status if this is not the last tile if (!IS_LAST_TILE) tile_state.SetInclusive(0, num_tile_selections); } // Discount any out-of-bounds selections if (IS_LAST_TILE) num_tile_selections -= (TILE_ITEMS - num_tile_items); // Scatter flagged items Scatter( items, selection_flags, selection_indices, num_tile_items, num_tile_selections, 0, 0, num_tile_selections); return num_tile_selections; } /** * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) */ template __device__ __forceinline__ OffsetT ConsumeSubsequentTile( int num_tile_items, ///< Number of input items comprising this tile int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor { InputT items[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; OffsetT selection_indices[ITEMS_PER_THREAD]; // Load items if (IS_LAST_TILE) BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); else BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); // Initialize selection_flags InitializeSelections( tile_offset, num_tile_items, items, selection_flags, Int2Type()); CTA_SYNC(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx); BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op); OffsetT num_tile_selections = prefix_op.GetBlockAggregate(); OffsetT num_selections = prefix_op.GetInclusivePrefix(); OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix; // Discount any out-of-bounds selections if (IS_LAST_TILE) { int num_discount = TILE_ITEMS - num_tile_items; num_selections -= num_discount; num_tile_selections -= num_discount; } // Scatter flagged items Scatter( items, selection_flags, selection_indices, num_tile_items, num_tile_selections, num_selections_prefix, num_rejected_prefix, num_selections); return num_selections; } /** * Process a tile of input */ template __device__ __forceinline__ OffsetT ConsumeTile( int num_tile_items, ///< Number of input items comprising this tile int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor { OffsetT num_selections; if (tile_idx == 0) { num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); } else { num_selections = ConsumeSubsequentTile(num_tile_items, 
tile_idx, tile_offset, tile_state); } return num_selections; } /** * Scan tiles of items as part of a dynamic chained scan */ template ///< Output iterator type for recording number of items selection_flags __device__ __forceinline__ void ConsumeRange( int num_tiles, ///< Total number of input tiles ScanTileStateT& tile_state, ///< Global tile state descriptor NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile if (tile_idx < num_tiles - 1) { // Not the last tile (full) ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state); } else { // The last tile (possibly partially-full) OffsetT num_remaining = num_items - tile_offset; OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); if (threadIdx.x == 0) { // Output the total number of items selection_flags *d_num_selected_out = num_selections; } } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_spmv_orig.cuh000066400000000000000000000663421434614775400200500ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. 
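 *
 * The agent is reached through the public cub::DeviceSpmv::CsrMV interface,
 * which computes y = A*x for a matrix A stored in CSR form.  A minimal
 * host-side sketch (the device-side CSR arrays and dense vectors are assumed
 * to be allocated and populated by the caller):
 *
 * \code
 * #include <cub/cub.cuh>
 *
 * void   *d_temp_storage     = nullptr;
 * size_t  temp_storage_bytes = 0;
 * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
 *                        d_values, d_row_offsets, d_column_indices,
 *                        d_vector_x, d_vector_y,
 *                        num_rows, num_cols, num_nonzeros);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
 *                        d_values, d_row_offsets, d_column_indices,
 *                        d_vector_x, d_vector_y,
 *                        num_rows, num_cols, num_nonzeros);
 * \endcode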
*/ #pragma once #include #include "../util_type.cuh" #include "../block/block_reduce.cuh" #include "../block/block_scan.cuh" #include "../block/block_exchange.cuh" #include "../config.cuh" #include "../thread/thread_search.cuh" #include "../thread/thread_operators.cuh" #include "../iterator/cache_modified_input_iterator.cuh" #include "../iterator/counting_input_iterator.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy ******************************************************************************/ /** * Parameterizable tuning policy type for AgentSpmv */ template < int _BLOCK_THREADS, ///< Threads per thread block int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory) BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use struct AgentSpmvPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory) }; static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ template < typename ValueT, ///< Matrix and vector value type typename OffsetT> ///< Signed integer type for sequence offsets struct SpmvParams { const ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. const OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values const OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
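    // Illustrative CSR layout for the three arrays above (example data only,
    // not part of the interface).  For the 2x3 matrix
    //
    //     [ 10   0  20 ]
    //     [  0  30   0 ]
    //
    // the expected encoding is:
    //
    //     d_values          = { 10, 20, 30 }
    //     d_column_indices  = {  0,  2,  1 }
    //     d_row_end_offsets = {  2,  3 }    // row i ends at d_row_end_offsets[i]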
const ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y int num_rows; ///< Number of rows of matrix A. int num_cols; ///< Number of columns of matrix A. int num_nonzeros; ///< Number of nonzero elements of matrix A. ValueT alpha; ///< Alpha multiplicand ValueT beta; ///< Beta addend-multiplicand }; /** * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. */ template < typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type typename ValueT, ///< Matrix and vector value type typename OffsetT, ///< Signed integer type for sequence offsets bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 bool HAS_BETA, ///< Whether the input parameter \p beta is 0 int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) struct AgentSpmv { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// Constants enum { BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, }; /// 2D merge path coordinate type typedef typename CubVector::Type CoordinateT; /// Input iterator wrapper types (for applying cache modifiers) typedef CacheModifiedInputIterator< AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, OffsetT, OffsetT> RowOffsetsSearchIteratorT; typedef CacheModifiedInputIterator< AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER, OffsetT, OffsetT> RowOffsetsIteratorT; typedef CacheModifiedInputIterator< AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER, OffsetT, OffsetT> ColumnIndicesIteratorT; typedef CacheModifiedInputIterator< AgentSpmvPolicyT::VALUES_LOAD_MODIFIER, ValueT, OffsetT> ValueIteratorT; typedef CacheModifiedInputIterator< AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, ValueT, OffsetT> VectorValueIteratorT; // Tuple type for scanning (pairs accumulated segment-value with segment-index) typedef KeyValuePair KeyValuePairT; // Reduce-value-by-segment scan operator typedef ReduceByKeyOp ReduceBySegmentOpT; // BlockReduce specialization typedef BlockReduce< ValueT, BLOCK_THREADS, BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceT; // BlockScan specialization typedef BlockScan< KeyValuePairT, BLOCK_THREADS, AgentSpmvPolicyT::SCAN_ALGORITHM> BlockScanT; // BlockScan specialization typedef BlockScan< ValueT, BLOCK_THREADS, AgentSpmvPolicyT::SCAN_ALGORITHM> BlockPrefixSumT; // BlockExchange specialization typedef BlockExchange< ValueT, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeT; /// Merge item type (either a non-zero value or a row-end offset) union MergeItem { // Value type to pair with index type OffsetT // (NullType if loading values directly during merge) using MergeValueT = cub::detail::conditional_t< AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>; OffsetT row_end_offset; MergeValueT nonzero; }; /// Shared memory type required by this thread block struct _TempStorage { CoordinateT tile_coords[2]; union Aliasable { // Smem needed for tile of merge items MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; // Smem needed for block exchange typename BlockExchangeT::TempStorage exchange; // Smem needed for block-wide reduction typename BlockReduceT::TempStorage reduce; // Smem needed for tile scanning typename 
BlockScanT::TempStorage scan; // Smem needed for tile prefix sum typename BlockPrefixSumT::TempStorage prefix_sum; } aliasable; }; /// Temporary storage type (unionable) struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; /// Reference to temp_storage SpmvParams& spmv_params; ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x //--------------------------------------------------------------------- // Interface //--------------------------------------------------------------------- /** * Constructor */ __device__ __forceinline__ AgentSpmv( TempStorage& temp_storage, ///< Reference to temp_storage SpmvParams& spmv_params) ///< SpMV input parameter bundle : temp_storage(temp_storage.Alias()), spmv_params(spmv_params), wd_values(spmv_params.d_values), wd_row_end_offsets(spmv_params.d_row_end_offsets), wd_column_indices(spmv_params.d_column_indices), wd_vector_x(spmv_params.d_vector_x), wd_vector_y(spmv_params.d_vector_y) {} /** * Consume a merge tile, specialized for direct-load of nonzeros */ __device__ __forceinline__ KeyValuePairT ConsumeTile( int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch { int tile_num_rows = tile_end_coord.x - tile_start_coord.x; int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; // Gather the row end-offsets for the merge tile into shared memory for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { const OffsetT offset = (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } CTA_SYNC(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); CoordinateT thread_start_coord; MergePathSearch( OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal s_tile_row_end_offsets, // List A tile_nonzero_indices, // List B tile_num_rows, tile_num_nonzeros, thread_start_coord); CTA_SYNC(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; KeyValuePairT scan_segment[ITEMS_PER_THREAD]; ValueT running_total = 0.0; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); OffsetT column_idx = wd_column_indices[nonzero_idx]; ValueT value = wd_values[nonzero_idx]; ValueT 
vector_value = wd_vector_x[column_idx]; ValueT nonzero = value * vector_value; OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) { // Move down (accumulate) running_total += nonzero; scan_segment[ITEM].value = running_total; scan_segment[ITEM].key = tile_num_rows; ++thread_current_coord.y; } else { // Move right (reset) scan_segment[ITEM].value = running_total; scan_segment[ITEM].key = thread_current_coord.x; running_total = 0.0; ++thread_current_coord.x; } } CTA_SYNC(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; ReduceBySegmentOpT scan_op; KeyValuePairT scan_item; scan_item.value = running_total; scan_item.key = thread_current_coord.x; BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); if (tile_num_rows > 0) { if (threadIdx.x == 0) scan_item.key = -1; // Direct scatter #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (scan_segment[ITEM].key < tile_num_rows) { if (scan_item.key == scan_segment[ITEM].key) scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; if (HAS_ALPHA) { scan_segment[ITEM].value *= spmv_params.alpha; } if (HAS_BETA) { // Update the output vector element ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; scan_segment[ITEM].value += addend; } // Set the output vector element spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; } } } // Return the tile's running carry-out return tile_carry; } /** * Consume a merge tile, specialized for indirect load of nonzeros */ __device__ __forceinline__ KeyValuePairT ConsumeTile( int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch { int tile_num_rows = tile_end_coord.x - tile_start_coord.x; int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; #if (CUB_PTX_ARCH >= 520) OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; // Gather the nonzeros for the merge tile into shared memory #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; ValueT* s = s_tile_nonzeros + nonzero_idx; if (nonzero_idx < tile_num_nonzeros) { OffsetT column_idx = *ci; ValueT value = *a; ValueT vector_value = wd_vector_x[column_idx]; ValueT nonzero = value * vector_value; *s = nonzero; } } #else OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; // Gather the nonzeros for the merge tile into shared memory if (tile_num_nonzeros > 0) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; ValueT vector_value = wd_vector_x[column_idx]; ValueT nonzero = value * vector_value; 
s_tile_nonzeros[nonzero_idx] = nonzero; } } #endif // Gather the row end-offsets for the merge tile into shared memory #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { const OffsetT offset = (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } CTA_SYNC(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); CoordinateT thread_start_coord; MergePathSearch( OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal s_tile_row_end_offsets, // List A tile_nonzero_indices, // List B tile_num_rows, tile_num_nonzeros, thread_start_coord); CTA_SYNC(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; KeyValuePairT scan_segment[ITEMS_PER_THREAD]; ValueT running_total = 0.0; OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) { // Move down (accumulate) scan_segment[ITEM].value = nonzero; running_total += nonzero; ++thread_current_coord.y; nonzero = s_tile_nonzeros[thread_current_coord.y]; } else { // Move right (reset) scan_segment[ITEM].value = 0.0; running_total = 0.0; ++thread_current_coord.x; row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; } scan_segment[ITEM].key = thread_current_coord.x; } CTA_SYNC(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; ReduceBySegmentOpT scan_op; KeyValuePairT scan_item; scan_item.value = running_total; scan_item.key = thread_current_coord.x; BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); if (threadIdx.x == 0) { scan_item.key = thread_start_coord.x; scan_item.value = 0.0; } if (tile_num_rows > 0) { CTA_SYNC(); // Scan downsweep and scatter ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; if (scan_item.key != scan_segment[0].key) { s_partials[scan_item.key] = scan_item.value; } else { scan_segment[0].value += scan_item.value; } #pragma unroll for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key) { s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; } else { scan_segment[ITEM].value += scan_segment[ITEM - 1].value; } } CTA_SYNC(); #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) { spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; } } // Return the tile's running carry-out return tile_carry; } /** * Consume input tile */ __device__ __forceinline__ void ConsumeTile( CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block int num_merge_tiles) ///< [in] Number of merge tiles { int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index if (tile_idx >= num_merge_tiles) return; // Read our starting coordinates if (threadIdx.x < 2) { if (d_tile_coordinates == NULL) { // Search our starting coordinates OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; CoordinateT tile_coord; CountingInputIterator nonzero_indices(0); // Search the merge path 
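            // The search locates where this tile's diagonal crosses the
            // conceptual merge of the row end-offsets (list A) with the
            // counting sequence of nonzero indices (list B); the resulting
            // coordinate carries the row index in .x and the nonzero index in .y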
MergePathSearch( diagonal, RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), nonzero_indices, spmv_params.num_rows, spmv_params.num_nonzeros, tile_coord); temp_storage.tile_coords[threadIdx.x] = tile_coord; } else { temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; } } CTA_SYNC(); CoordinateT tile_start_coord = temp_storage.tile_coords[0]; CoordinateT tile_end_coord = temp_storage.tile_coords[1]; // Consume multi-segment tile KeyValuePairT tile_carry = ConsumeTile( tile_idx, tile_start_coord, tile_end_coord, Int2Type()); // Output the tile's carry-out if (threadIdx.x == 0) { if (HAS_ALPHA) { tile_carry.value *= spmv_params.alpha; } tile_carry.key += tile_start_coord.x; if (tile_carry.key >= spmv_params.num_rows) { // FIXME: This works around an invalid memory access in the // fixup kernel. The underlying issue needs to be debugged and // properly fixed, but this hack prevents writes to // out-of-bounds addresses. It doesn't appear to have an effect // on the validity of the results, since this only affects the // carry-over from last tile in the input. tile_carry.key = spmv_params.num_rows - 1; tile_carry.value = ValueT{}; }; d_tile_carry_pairs[tile_idx] = tile_carry; } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_sub_warp_merge_sort.cuh000066400000000000000000000260751434614775400221120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN template < int WARP_THREADS_ARG, int ITEMS_PER_THREAD_ARG, cub::WarpLoadAlgorithm LOAD_ALGORITHM_ARG = cub::WARP_LOAD_DIRECT, cub::CacheLoadModifier LOAD_MODIFIER_ARG = cub::LOAD_LDG, cub::WarpStoreAlgorithm STORE_ALGORITHM_ARG = cub::WARP_STORE_DIRECT> struct AgentSubWarpMergeSortPolicy { static constexpr int WARP_THREADS = WARP_THREADS_ARG; static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_ARG; static constexpr int ITEMS_PER_TILE = WARP_THREADS * ITEMS_PER_THREAD; static constexpr cub::WarpLoadAlgorithm LOAD_ALGORITHM = LOAD_ALGORITHM_ARG; static constexpr cub::CacheLoadModifier LOAD_MODIFIER = LOAD_MODIFIER_ARG; static constexpr cub::WarpStoreAlgorithm STORE_ALGORITHM = STORE_ALGORITHM_ARG; }; template struct AgentSmallAndMediumSegmentedSortPolicy { static constexpr int BLOCK_THREADS = BLOCK_THREADS_ARG; using SmallPolicyT = SmallPolicy; using MediumPolicyT = MediumPolicy; constexpr static int SEGMENTS_PER_MEDIUM_BLOCK = BLOCK_THREADS / MediumPolicyT::WARP_THREADS; constexpr static int SEGMENTS_PER_SMALL_BLOCK = BLOCK_THREADS / SmallPolicyT::WARP_THREADS; }; /** * @brief AgentSubWarpSort implements a sub-warp merge sort. * * This agent can work with any power of two number of threads, not exceeding * 32. The number of threads is defined in the `PolicyT::WARP_THREADS`. Virtual * warp of `PolicyT::WARP_THREADS` will efficiently load data using * `PolicyT::LOAD_ALGORITHM`, sort it using `WarpMergeSort`, and store it back * using `PolicyT::STORE_ALGORITHM`. * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam PolicyT * Chained tuning policy * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam OffsetT * Signed integer type for global offsets */ template class AgentSubWarpSort { struct BinaryOpT { template __device__ bool operator()(T lhs, T rhs) { return this->impl(lhs, rhs); } #if defined(__CUDA_FP16_TYPES_EXIST__) __device__ bool operator()(__half lhs, __half rhs) { // Need to explicitly cast to float for SM <= 52. NV_IF_TARGET(NV_PROVIDES_SM_53, (return this->impl(lhs, rhs);), (return this->impl(__half2float(lhs), __half2float(rhs));)); } #endif private: template __device__ bool impl(T lhs, T rhs) { if (IS_DESCENDING) { return lhs > rhs; } else { return lhs < rhs; } } }; #if defined(__CUDA_FP16_TYPES_EXIST__) __device__ static bool equal(__half lhs, __half rhs) { // Need to explicitly cast to float for SM <= 52. 
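      // (Native __half comparison operators are only available on SM 5.3 and
      // newer, so older architectures compare the float promotions instead.)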
NV_IF_TARGET(NV_PROVIDES_SM_53, (return lhs == rhs;), (return __half2float(lhs) == __half2float(rhs);)); } #endif template __device__ static bool equal(T lhs, T rhs) { return lhs == rhs; } public: static constexpr bool KEYS_ONLY = std::is_same::value; using WarpMergeSortT = WarpMergeSort; using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core:: LoadIterator::type; using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core:: LoadIterator::type; using WarpLoadKeysT = cub::WarpLoad; using WarpLoadItemsT = cub::WarpLoad; using WarpStoreKeysT = cub::WarpStore; using WarpStoreItemsT = cub::WarpStore; union _TempStorage { typename WarpLoadKeysT::TempStorage load_keys; typename WarpLoadItemsT::TempStorage load_items; typename WarpMergeSortT::TempStorage sort; typename WarpStoreKeysT::TempStorage store_keys; typename WarpStoreItemsT::TempStorage store_items; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; _TempStorage &storage; __device__ __forceinline__ explicit AgentSubWarpSort(TempStorage &temp_storage) : storage(temp_storage.Alias()) { } __device__ __forceinline__ void ProcessSegment(int segment_size, KeysLoadItT keys_input, KeyT *keys_output, ItemsLoadItT values_input, ValueT *values_output) { WarpMergeSortT warp_merge_sort(storage.sort); if (segment_size < 3) { ShortCircuit(warp_merge_sort.get_linear_tid(), segment_size, keys_input, keys_output, values_input, values_output, BinaryOpT{}); } else { KeyT keys[PolicyT::ITEMS_PER_THREAD]; ValueT values[PolicyT::ITEMS_PER_THREAD]; // For FP64 the difference is: // Lowest() -> -1.79769e+308 = 00...00b -> TwiddleIn -> -0 = 10...00b // LOWEST -> -nan = 11...11b -> TwiddleIn -> 0 = 00...00b using UnsignedBitsT = typename Traits::UnsignedBits; UnsignedBitsT default_key_bits = IS_DESCENDING ? Traits::LOWEST_KEY : Traits::MAX_KEY; KeyT oob_default = reinterpret_cast(default_key_bits); WarpLoadKeysT(storage.load_keys) .Load(keys_input, keys, segment_size, oob_default); WARP_SYNC(warp_merge_sort.get_member_mask()); if (!KEYS_ONLY) { WarpLoadItemsT(storage.load_items) .Load(values_input, values, segment_size); WARP_SYNC(warp_merge_sort.get_member_mask()); } warp_merge_sort.Sort(keys, values, BinaryOpT{}, segment_size, oob_default); WARP_SYNC(warp_merge_sort.get_member_mask()); WarpStoreKeysT(storage.store_keys).Store(keys_output, keys, segment_size); if (!KEYS_ONLY) { WARP_SYNC(warp_merge_sort.get_member_mask()); WarpStoreItemsT(storage.store_items) .Store(values_output, values, segment_size); } } } private: /** * This method implements a shortcut for sorting less than three items. * Only the first thread of a virtual warp is used for soring. 
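   *
   * For a single item the key (and value, if any) is simply copied through.
   * For two items the pair is swapped only when the comparator orders the
   * second key strictly before the first; ties keep their input order, so the
   * shortcut preserves the stability of the surrounding merge sort.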
*/ template __device__ __forceinline__ void ShortCircuit(unsigned int linear_tid, OffsetT segment_size, KeysLoadItT keys_input, KeyT *keys_output, ItemsLoadItT values_input, ValueT *values_output, CompareOpT binary_op) { if (segment_size == 1) { if (linear_tid == 0) { if (keys_input.ptr != keys_output) { keys_output[0] = keys_input[0]; } if (!KEYS_ONLY) { if (values_input.ptr != values_output) { values_output[0] = values_input[0]; } } } } else if (segment_size == 2) { if (linear_tid == 0) { KeyT lhs = keys_input[0]; KeyT rhs = keys_input[1]; if (equal(lhs, rhs) || binary_op(lhs, rhs)) { keys_output[0] = lhs; keys_output[1] = rhs; if (!KEYS_ONLY) { if (values_output != values_input.ptr) { values_output[0] = values_input[0]; values_output[1] = values_input[1]; } } } else { keys_output[0] = rhs; keys_output[1] = lhs; if (!KEYS_ONLY) { // values_output might be an alias for values_input, so // we have to use registers here const ValueT lhs_val = values_input[0]; const ValueT rhs_val = values_input[1]; values_output[0] = rhs_val; values_output[1] = lhs_val; } } } } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_three_way_partition.cuh000066400000000000000000000525401434614775400221160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ template struct AgentThreeWayPartitionPolicy { constexpr static int BLOCK_THREADS = _BLOCK_THREADS; constexpr static int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; constexpr static BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; constexpr static CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; constexpr static BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; /** * \brief Implements a device-wide three-way partitioning * * Splits input data into three parts based on the selection functors. If the * first functor selects an item, the algorithm places it in the first part. * Otherwise, if the second functor selects an item, the algorithm places it in * the second part. If both functors don't select an item, the algorithm places * it into the unselected part. */ template struct AgentThreeWayPartition { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input value type using InputT = cub::detail::value_t; // Tile status descriptor interface type using ScanTileStateT = cub::ScanTileState; // Constants constexpr static int BLOCK_THREADS = PolicyT::BLOCK_THREADS; constexpr static int ITEMS_PER_THREAD = PolicyT::ITEMS_PER_THREAD; constexpr static int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; using WrappedInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, cub::CacheModifiedInputIterator, InputIteratorT>; // Parameterized BlockLoad type for input data using BlockLoadT = cub::BlockLoad; // Parameterized BlockScan type using BlockScanT = cub::BlockScan; // Callback type for obtaining tile prefix during block scan using TilePrefixCallbackOpT = cub::TilePrefixCallbackOp; // Item exchange type using ItemExchangeT = InputT[TILE_ITEMS]; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { // Smem needed for tile scanning typename BlockScanT::TempStorage scan; // Smem needed for cooperative prefix callback typename TilePrefixCallbackOpT::TempStorage prefix; } scan_storage; // Smem needed for loading items typename BlockLoadT::TempStorage load_items; // Smem needed for compacting items (allows non POD items in this union) cub::Uninitialized raw_exchange; }; // Alias wrapper allowing storage to be unioned struct TempStorage : cub::Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; ///< Reference to temp_storage WrappedInputIteratorT d_in; ///< Input items FirstOutputIteratorT d_first_part_out; SecondOutputIteratorT d_second_part_out; UnselectedOutputIteratorT d_unselected_out; SelectFirstPartOp select_first_part_op; SelectSecondPartOp select_second_part_op; OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- // Constructor __device__ __forceinline__ AgentThreeWayPartition(TempStorage &temp_storage, InputIteratorT d_in, 
FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_first_part_out(d_first_part_out) , d_second_part_out(d_second_part_out) , d_unselected_out(d_unselected_out) , select_first_part_op(select_first_part_op) , select_second_part_op(select_second_part_op) , num_items(num_items) {} //--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- template __device__ __forceinline__ void Initialize(OffsetT num_tile_items, InputT (&items)[ITEMS_PER_THREAD], OffsetT (&first_items_selection_flags)[ITEMS_PER_THREAD], OffsetT (&second_items_selection_flags)[ITEMS_PER_THREAD]) { for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Out-of-bounds items are selection_flags first_items_selection_flags[ITEM] = 1; second_items_selection_flags[ITEM] = 1; if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) { first_items_selection_flags[ITEM] = select_first_part_op(items[ITEM]); second_items_selection_flags[ITEM] = first_items_selection_flags[ITEM] ? 0 : select_second_part_op(items[ITEM]); } } } template __device__ __forceinline__ void Scatter( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&first_items_selection_flags)[ITEMS_PER_THREAD], OffsetT (&first_items_selection_indices)[ITEMS_PER_THREAD], OffsetT (&second_items_selection_flags)[ITEMS_PER_THREAD], OffsetT (&second_items_selection_indices)[ITEMS_PER_THREAD], int num_tile_items, int num_first_tile_selections, int num_second_tile_selections, OffsetT num_first_selections_prefix, OffsetT num_second_selections_prefix, OffsetT num_rejected_prefix) { CTA_SYNC(); int first_item_end = num_first_tile_selections; int second_item_end = first_item_end + num_second_tile_selections; // Scatter items to shared memory (rejections first) for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; if (!IS_LAST_TILE || (item_idx < num_tile_items)) { int local_scatter_offset = 0; if (first_items_selection_flags[ITEM]) { local_scatter_offset = first_items_selection_indices[ITEM] - num_first_selections_prefix; } else if (second_items_selection_flags[ITEM]) { local_scatter_offset = first_item_end + second_items_selection_indices[ITEM] - num_second_selections_prefix; } else { // Medium item int local_selection_idx = (first_items_selection_indices[ITEM] - num_first_selections_prefix) + (second_items_selection_indices[ITEM] - num_second_selections_prefix); local_scatter_offset = second_item_end + item_idx - local_selection_idx; } temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; } } CTA_SYNC(); // Gather items from shared memory and scatter to global for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; if (!IS_LAST_TILE || (item_idx < num_tile_items)) { InputT item = temp_storage.raw_exchange.Alias()[item_idx]; if (item_idx < first_item_end) { d_first_part_out[num_first_selections_prefix + item_idx] = item; } else if (item_idx < second_item_end) { d_second_part_out[num_second_selections_prefix + item_idx - first_item_end] = item; } else { int rejection_idx = item_idx - second_item_end; d_unselected_out[num_rejected_prefix + rejection_idx] = item; } } } } 
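  /*
   * For reference, the net effect of the scatter above matches this sequential
   * sketch (illustrative only; the iterator and functor names mirror the
   * members of this agent, but the loop itself is an assumption, not part of
   * the device implementation). The relative order of items within each part
   * is preserved:
   *
   *   OffsetT first = 0, second = 0, unselected = 0;
   *   for (OffsetT i = 0; i < num_items; ++i)
   *   {
   *     InputT item = d_in[i];
   *     if (select_first_part_op(item))
   *       d_first_part_out[first++] = item;
   *     else if (select_second_part_op(item))
   *       d_second_part_out[second++] = item;
   *     else
   *       d_unselected_out[unselected++] = item;
   *   }
   */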
//--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * Process first tile of input (dynamic chained scan). * Returns the running count of selections (including this tile) * * @param num_tile_items Number of input items comprising this tile * @param tile_offset Tile offset * @param first_tile_state Global tile state descriptor * @param second_tile_state Global tile state descriptor */ template __device__ __forceinline__ void ConsumeFirstTile(int num_tile_items, OffsetT tile_offset, ScanTileStateT &first_tile_state, ScanTileStateT &second_tile_state, OffsetT &first_items, OffsetT &second_items) { InputT items[ITEMS_PER_THREAD]; OffsetT first_items_selection_flags[ITEMS_PER_THREAD]; OffsetT first_items_selection_indices[ITEMS_PER_THREAD]; OffsetT second_items_selection_flags[ITEMS_PER_THREAD]; OffsetT second_items_selection_indices[ITEMS_PER_THREAD]; // Load items if (IS_LAST_TILE) { BlockLoadT(temp_storage.load_items) .Load(d_in + tile_offset, items, num_tile_items); } else { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); } // Initialize selection_flags Initialize( num_tile_items, items, first_items_selection_flags, second_items_selection_flags); CTA_SYNC(); // Exclusive scan of selection_flags BlockScanT(temp_storage.scan_storage.scan) .ExclusiveSum(first_items_selection_flags, first_items_selection_indices, first_items); if (threadIdx.x == 0) { // Update tile status if this is not the last tile if (!IS_LAST_TILE) { first_tile_state.SetInclusive(0, first_items); } } CTA_SYNC(); // Exclusive scan of selection_flags BlockScanT(temp_storage.scan_storage.scan) .ExclusiveSum(second_items_selection_flags, second_items_selection_indices, second_items); if (threadIdx.x == 0) { // Update tile status if this is not the last tile if (!IS_LAST_TILE) { second_tile_state.SetInclusive(0, second_items); } } // Discount any out-of-bounds selections if (IS_LAST_TILE) { first_items -= (TILE_ITEMS - num_tile_items); second_items -= (TILE_ITEMS - num_tile_items); } // Scatter flagged items Scatter( items, first_items_selection_flags, first_items_selection_indices, second_items_selection_flags, second_items_selection_indices, num_tile_items, first_items, second_items, // all the prefixes equal to 0 because it's the first tile 0, 0, 0); } /** * Process subsequent tile of input (dynamic chained scan). 
* Returns the running count of selections (including this tile) * * @param num_tile_items Number of input items comprising this tile * @param tile_idx Tile index * @param tile_offset Tile offset * @param first_tile_state Global tile state descriptor * @param second_tile_state Global tile state descriptor */ template __device__ __forceinline__ void ConsumeSubsequentTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT &first_tile_state, ScanTileStateT &second_tile_state, OffsetT &num_first_items_selections, OffsetT &num_second_items_selections) { InputT items[ITEMS_PER_THREAD]; OffsetT first_items_selection_flags[ITEMS_PER_THREAD]; OffsetT first_items_selection_indices[ITEMS_PER_THREAD]; OffsetT second_items_selection_flags[ITEMS_PER_THREAD]; OffsetT second_items_selection_indices[ITEMS_PER_THREAD]; // Load items if (IS_LAST_TILE) { BlockLoadT(temp_storage.load_items).Load( d_in + tile_offset, items, num_tile_items); } else { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); } // Initialize selection_flags Initialize( num_tile_items, items, first_items_selection_flags, second_items_selection_flags); CTA_SYNC(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT first_prefix_op(first_tile_state, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx); BlockScanT(temp_storage.scan_storage.scan) .ExclusiveSum(first_items_selection_flags, first_items_selection_indices, first_prefix_op); num_first_items_selections = first_prefix_op.GetInclusivePrefix(); OffsetT num_first_items_in_tile_selections = first_prefix_op.GetBlockAggregate(); OffsetT num_first_items_selections_prefix = first_prefix_op.GetExclusivePrefix(); CTA_SYNC(); TilePrefixCallbackOpT second_prefix_op(second_tile_state, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx); BlockScanT(temp_storage.scan_storage.scan) .ExclusiveSum(second_items_selection_flags, second_items_selection_indices, second_prefix_op); num_second_items_selections = second_prefix_op.GetInclusivePrefix(); OffsetT num_second_items_in_tile_selections = second_prefix_op.GetBlockAggregate(); OffsetT num_second_items_selections_prefix = second_prefix_op.GetExclusivePrefix(); OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_first_items_selections_prefix - num_second_items_selections_prefix; // Discount any out-of-bounds selections. There are exactly // TILE_ITEMS - num_tile_items elements like that because we // marked them as selected in Initialize method. 
if (IS_LAST_TILE) { const int num_discount = TILE_ITEMS - num_tile_items; num_first_items_selections -= num_discount; num_first_items_in_tile_selections -= num_discount; num_second_items_selections -= num_discount; num_second_items_in_tile_selections -= num_discount; } // Scatter flagged items Scatter( items, first_items_selection_flags, first_items_selection_indices, second_items_selection_flags, second_items_selection_indices, num_tile_items, num_first_items_in_tile_selections, num_second_items_in_tile_selections, num_first_items_selections_prefix, num_second_items_selections_prefix, num_rejected_prefix); } /** * Process a tile of input */ template __device__ __forceinline__ void ConsumeTile( int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT& first_tile_state, ScanTileStateT& second_tile_state, OffsetT& first_items, OffsetT& second_items) { if (tile_idx == 0) { ConsumeFirstTile(num_tile_items, tile_offset, first_tile_state, second_tile_state, first_items, second_items); } else { ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, first_tile_state, second_tile_state, first_items, second_items); } } /** * Scan tiles of items as part of a dynamic chained scan * * @tparam NumSelectedIteratorT * Output iterator type for recording number of items selection_flags * * @param num_tiles * Total number of input tiles * * @param first_tile_state * Global tile state descriptor * * @param second_tile_state * Global tile state descriptor * * @param d_num_selected_out * Output total number selection_flags */ template __device__ __forceinline__ void ConsumeRange(int num_tiles, ScanTileStateT &first_tile_state, ScanTileStateT &second_tile_state, NumSelectedIteratorT d_num_selected_out) { // Blocks are launched in increasing order, so just assign one tile per block // Current tile index int tile_idx = static_cast((blockIdx.x * gridDim.y) + blockIdx.y); // Global offset for the current tile OffsetT tile_offset = tile_idx * TILE_ITEMS; OffsetT num_first_selections; OffsetT num_second_selections; if (tile_idx < num_tiles - 1) { // Not the last tile (full) ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, first_tile_state, second_tile_state, num_first_selections, num_second_selections); } else { // The last tile (possibly partially-full) OffsetT num_remaining = num_items - tile_offset; ConsumeTile(num_remaining, tile_idx, tile_offset, first_tile_state, second_tile_state, num_first_selections, num_second_selections); if (threadIdx.x == 0) { // Output the total number of items selection_flags d_num_selected_out[0] = num_first_selections; d_num_selected_out[1] = num_second_selections; } } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/agent_unique_by_key.cuh000066400000000000000000000514431434614775400207070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key. */ #pragma once #include #include #include "../thread/thread_operators.cuh" #include "../block/block_load.cuh" #include "../block/block_scan.cuh" #include "../agent/single_pass_scan_operators.cuh" #include "../block/block_discontinuity.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentUniqueByKey */ template struct AgentUniqueByKeyPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ITEMS_PER_THREAD = _ITEMS_PER_THREAD, }; static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * \brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key */ template < typename AgentUniqueByKeyPolicyT, ///< Parameterized AgentUniqueByKeyPolicy tuning policy type typename KeyInputIteratorT, ///< Random-access input iterator type for keys typename ValueInputIteratorT, ///< Random-access input iterator type for values typename KeyOutputIteratorT, ///< Random-access output iterator type for keys typename ValueOutputIteratorT, ///< Random-access output iterator type for values typename EqualityOpT, ///< Equality operator type typename OffsetT> ///< Signed integer type for global offsets struct AgentUniqueByKey { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input key and value type using KeyT = typename std::iterator_traits::value_type; using ValueT = typename std::iterator_traits::value_type; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; // Constants enum { BLOCK_THREADS = AgentUniqueByKeyPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentUniqueByKeyPolicyT::ITEMS_PER_THREAD, ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD, }; // Cache-modified Input iterator wrapper type (for applying 
cache modifier) for keys using WrappedKeyInputIteratorT = typename std::conditional::value, CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator KeyInputIteratorT>::type; // Directly use the supplied input iterator type // Cache-modified Input iterator wrapper type (for applying cache modifier) for values using WrappedValueInputIteratorT = typename std::conditional::value, CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator ValueInputIteratorT>::type; // Directly use the supplied input iterator type // Parameterized BlockLoad type for input data using BlockLoadKeys = BlockLoad< KeyT, BLOCK_THREADS, ITEMS_PER_THREAD, AgentUniqueByKeyPolicyT::LOAD_ALGORITHM>; // Parameterized BlockLoad type for flags using BlockLoadValues = BlockLoad< ValueT, BLOCK_THREADS, ITEMS_PER_THREAD, AgentUniqueByKeyPolicyT::LOAD_ALGORITHM>; // Parameterized BlockDiscontinuity type for items using BlockDiscontinuityKeys = cub::BlockDiscontinuity; // Parameterized BlockScan type using BlockScanT = cub::BlockScan; // Parameterized BlockDiscontinuity type for items using TilePrefixCallback = cub::TilePrefixCallbackOp; // Key exchange type using KeyExchangeT = KeyT[ITEMS_PER_TILE]; // Value exchange type using ValueExchangeT = ValueT[ITEMS_PER_TILE]; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { typename BlockScanT::TempStorage scan; typename TilePrefixCallback::TempStorage prefix; typename BlockDiscontinuityKeys::TempStorage discontinuity; } scan_storage; // Smem needed for loading keys typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading values typename BlockLoadValues::TempStorage load_values; // Smem needed for compacting items (allows non POD items in this union) Uninitialized shared_keys; Uninitialized shared_values; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; WrappedKeyInputIteratorT d_keys_in; WrappedValueInputIteratorT d_values_in; KeyOutputIteratorT d_keys_out; ValueOutputIteratorT d_values_out; cub::InequalityWrapper inequality_op; OffsetT num_items; //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- // Constructor __device__ __forceinline__ AgentUniqueByKey( TempStorage &temp_storage_, WrappedKeyInputIteratorT d_keys_in_, WrappedValueInputIteratorT d_values_in_, KeyOutputIteratorT d_keys_out_, ValueOutputIteratorT d_values_out_, EqualityOpT equality_op_, OffsetT num_items_) : temp_storage(temp_storage_.Alias()), d_keys_in(d_keys_in_), d_values_in(d_values_in_), d_keys_out(d_keys_out_), d_values_out(d_values_out_), inequality_op(equality_op_), num_items(num_items_) {} //--------------------------------------------------------------------- // Utility functions //--------------------------------------------------------------------- struct KeyTagT {}; struct ValueTagT {}; __device__ __forceinline__ KeyExchangeT &GetShared(KeyTagT) { return temp_storage.shared_keys.Alias(); } __device__ __forceinline__ ValueExchangeT &GetShared(ValueTagT) { return temp_storage.shared_values.Alias(); } //--------------------------------------------------------------------- // Scatter utility methods 
//--------------------------------------------------------------------- template __device__ __forceinline__ void Scatter( Tag tag, OutputIt items_out, T (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], int /*num_tile_items*/, int num_tile_selections, OffsetT num_selections_prefix, OffsetT /*num_selections*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; if (selection_flags[ITEM]) { GetShared(tag)[local_scatter_offset] = items[ITEM]; } } CTA_SYNC(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { items_out[num_selections_prefix + item] = GetShared(tag)[item]; } CTA_SYNC(); } //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * Process first tile of input (dynamic chained scan). Returns the running count of selections (including this tile) */ template __device__ __forceinline__ OffsetT ConsumeFirstTile( int num_tile_items, ///< Number of input items comprising this tile OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor { KeyT keys[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; OffsetT selection_idx[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoadKeys(temp_storage.load_keys) .Load(d_keys_in + tile_offset, keys, num_tile_items, *(d_keys_in + tile_offset)); } else { BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } CTA_SYNC(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoadValues(temp_storage.load_values) .Load(d_values_in + tile_offset, values, num_tile_items, *(d_values_in + tile_offset)); } else { BlockLoadValues(temp_storage.load_values) .Load(d_values_in + tile_offset, values); } CTA_SYNC(); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) .FlagHeads(selection_flags, keys, inequality_op); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Set selection_flags for out-of-bounds items if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) selection_flags[ITEM] = 1; } CTA_SYNC(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; OffsetT num_selections_prefix = 0; BlockScanT(temp_storage.scan_storage.scan) .ExclusiveSum(selection_flags, selection_idx, num_tile_selections); if (threadIdx.x == 0) { // Update tile status if this is not the last tile if (!IS_LAST_TILE) tile_state.SetInclusive(0, num_tile_selections); } // Do not count any out-of-bounds selections if (IS_LAST_TILE) { int num_discount = ITEMS_PER_TILE - num_tile_items; num_tile_selections -= num_discount; } num_selections = num_tile_selections; CTA_SYNC(); Scatter(KeyTagT(), d_keys_out, keys, selection_flags, selection_idx, num_tile_items, num_tile_selections, num_selections_prefix, num_selections); CTA_SYNC(); Scatter(ValueTagT(), d_values_out, values, selection_flags, selection_idx, num_tile_items, num_tile_selections, num_selections_prefix, num_selections); return num_selections; } /** * Process subsequent tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) */ template __device__ __forceinline__ OffsetT ConsumeSubsequentTile( int num_tile_items, ///< Number of input items comprising this tile int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor { KeyT keys[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; OffsetT selection_idx[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoadKeys(temp_storage.load_keys) .Load(d_keys_in + tile_offset, keys, num_tile_items, *(d_keys_in + tile_offset)); } else { BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } CTA_SYNC(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoadValues(temp_storage.load_values) .Load(d_values_in + tile_offset, values, num_tile_items, *(d_values_in + tile_offset)); } else { BlockLoadValues(temp_storage.load_values) .Load(d_values_in + tile_offset, values); } CTA_SYNC(); KeyT tile_predecessor = d_keys_in[tile_offset - 1]; BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) .FlagHeads(selection_flags, keys, inequality_op, tile_predecessor); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Set selection_flags for out-of-bounds items if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) selection_flags[ITEM] = 1; } CTA_SYNC(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; OffsetT num_selections_prefix = 0; TilePrefixCallback prefix_cb(tile_state, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx); BlockScanT(temp_storage.scan_storage.scan) .ExclusiveSum(selection_flags, selection_idx, prefix_cb); num_selections = prefix_cb.GetInclusivePrefix(); num_tile_selections = prefix_cb.GetBlockAggregate(); num_selections_prefix = prefix_cb.GetExclusivePrefix(); if (IS_LAST_TILE) { int num_discount = ITEMS_PER_TILE - num_tile_items; num_tile_selections -= num_discount; num_selections -= num_discount; } CTA_SYNC(); Scatter(KeyTagT(), d_keys_out, keys, selection_flags, selection_idx, num_tile_items, num_tile_selections, num_selections_prefix, num_selections); CTA_SYNC(); Scatter(ValueTagT(), d_values_out, values, selection_flags, selection_idx, num_tile_items, num_tile_selections, num_selections_prefix, num_selections); return num_selections; } /** * Process a tile of input */ template __device__ __forceinline__ OffsetT ConsumeTile( int num_tile_items, ///< Number of input items comprising this tile int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor { OffsetT num_selections; if (tile_idx == 0) { num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); } else { num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); } return num_selections; } /** * Scan tiles of items as part of a dynamic chained scan */ template ///< Output iterator type for recording number of items selection_flags __device__ __forceinline__ void ConsumeRange( int num_tiles, ///< Total number of input tiles ScanTileStateT& tile_state, ///< Global tile state descriptor NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = 
(blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index OffsetT tile_offset = tile_idx * ITEMS_PER_TILE; // Global offset for the current tile if (tile_idx < num_tiles - 1) { ConsumeTile(ITEMS_PER_TILE, tile_idx, tile_offset, tile_state); } else { int num_remaining = static_cast(num_items - tile_offset); OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); if (threadIdx.x == 0) { *d_num_selected_out = num_selections; } } } }; CUB_NAMESPACE_END cub-2.0.1/cub/agent/single_pass_scan_operators.cuh000066400000000000000000000656321434614775400222770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Callback operator types for supplying BlockScan prefixes */ #pragma once #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Prefix functor type for maintaining a running prefix while scanning a * region independent of other thread blocks ******************************************************************************/ /** * Stateful callback operator type for supplying BlockScan prefixes. * Maintains a running prefix that can be applied to consecutive * BlockScan operations. */ template < typename T, ///< BlockScan value type typename ScanOpT> ///< Wrapped scan operator type struct BlockScanRunningPrefixOp { ScanOpT op; ///< Wrapped scan operator T running_total; ///< Running block-wide prefix /// Constructor __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) : op(op) {} /// Constructor __device__ __forceinline__ BlockScanRunningPrefixOp( T starting_prefix, ScanOpT op) : op(op), running_total(starting_prefix) {} /** * Prefix callback operator. Returns the block-wide running_total in thread-0. 
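     *
     * \par
     * A minimal sketch of the intended usage pattern (assumes a 128-thread
     * block scanning \p int data; the names \p d_in, \p d_out, and
     * \p num_items are illustrative and \p num_items is assumed to be a
     * multiple of the tile size). The same functor instance is passed to
     * consecutive BlockScan calls so each tile's scan is seeded with the
     * running total of the tiles before it:
     *
     * \code
     * __shared__ typename cub::BlockScan<int, 128>::TempStorage temp_storage;
     *
     * // Running prefix for this thread block, seeded with zero
     * cub::BlockScanRunningPrefixOp<int, cub::Sum> prefix_op(0, cub::Sum());
     *
     * for (int block_offset = 0; block_offset < num_items; block_offset += 128)
     * {
     *     int thread_data = d_in[block_offset + threadIdx.x];
     *
     *     // Exclusive scan of this tile, seeded by (and updating) prefix_op
     *     cub::BlockScan<int, 128>(temp_storage).ExclusiveSum(
     *         thread_data, thread_data, prefix_op);
     *     __syncthreads();
     *
     *     d_out[block_offset + threadIdx.x] = thread_data;
     * }
     * \endcode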
*/ __device__ __forceinline__ T operator()( const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs { T retval = running_total; running_total = op(running_total, block_aggregate); return retval; } }; /****************************************************************************** * Generic tile status interface types for block-cooperative scans ******************************************************************************/ /** * Enumerations of tile status */ enum ScanTileStatus { SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) SCAN_TILE_INVALID = 99, // Not yet processed SCAN_TILE_PARTIAL, // Tile aggregate is available SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available }; /** * Tile status interface. */ template < typename T, bool SINGLE_WORD = Traits::PRIMITIVE> struct ScanTileState; /** * Tile status interface specialized for scan status and value types * that can be combined into one machine word that can be * read/written coherently in a single access. */ template struct ScanTileState { // Status word type using StatusWord = cub::detail::conditional_t< sizeof(T) == 8, long long, cub::detail::conditional_t< sizeof(T) == 4, int, cub::detail::conditional_t>>; // Unit word type using TxnWord = cub::detail::conditional_t< sizeof(T) == 8, longlong2, cub::detail::conditional_t< sizeof(T) == 4, int2, cub::detail::conditional_t>>; // Device word type struct TileDescriptor { StatusWord status; T value; }; // Constants enum { TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, }; // Device storage TxnWord *d_tile_descriptors; /// Constructor __host__ __device__ __forceinline__ ScanTileState() : d_tile_descriptors(NULL) {} /// Initializer __host__ __device__ __forceinline__ cudaError_t Init( int /*num_tiles*/, ///< [in] Number of tiles void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation { d_tile_descriptors = reinterpret_cast(d_temp_storage); return cudaSuccess; } /** * Compute device memory needed for tile status */ __host__ __device__ __forceinline__ static cudaError_t AllocationSize( int num_tiles, ///< [in] Number of tiles size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation { temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors return cudaSuccess; } /** * Initialize (from device) */ __device__ __forceinline__ void InitializeStatus(int num_tiles) { int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; TxnWord val = TxnWord(); TileDescriptor *descriptor = reinterpret_cast(&val); if (tile_idx < num_tiles) { // Not-yet-set descriptor->status = StatusWord(SCAN_TILE_INVALID); d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; } if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) { // Padding descriptor->status = StatusWord(SCAN_TILE_OOB); d_tile_descriptors[threadIdx.x] = val; } } /** * Update the specified tile's inclusive value and corresponding status */ __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) { TileDescriptor tile_descriptor; tile_descriptor.status = SCAN_TILE_INCLUSIVE; tile_descriptor.value = tile_inclusive; TxnWord alias; *reinterpret_cast(&alias) = tile_descriptor; ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); } /** * Update the specified tile's partial value and corresponding status */ __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) { TileDescriptor tile_descriptor; tile_descriptor.status = SCAN_TILE_PARTIAL; tile_descriptor.value = tile_partial; TxnWord alias; *reinterpret_cast(&alias) = tile_descriptor; ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); } /** * Wait for the corresponding tile to become non-invalid */ __device__ __forceinline__ void WaitForValid( int tile_idx, StatusWord &status, T &value) { TileDescriptor tile_descriptor; do { __threadfence_block(); // prevent hoisting loads from loop TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); status = tile_descriptor.status; value = tile_descriptor.value; } }; /** * Tile status interface specialized for scan status and value types that * cannot be combined into one machine word. */ template struct ScanTileState { // Status word type typedef char StatusWord; // Constants enum { TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, }; // Device storage StatusWord *d_tile_status; T *d_tile_partial; T *d_tile_inclusive; /// Constructor __host__ __device__ __forceinline__ ScanTileState() : d_tile_status(NULL), d_tile_partial(NULL), d_tile_inclusive(NULL) {} /// Initializer __host__ __device__ __forceinline__ cudaError_t Init( int num_tiles, ///< [in] Number of tiles void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation { cudaError_t error = cudaSuccess; do { void* allocations[3] = {}; size_t allocation_sizes[3]; allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives // Compute allocation pointers into the single storage blob if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; // Alias the offsets d_tile_status = reinterpret_cast(allocations[0]); d_tile_partial = reinterpret_cast(allocations[1]); d_tile_inclusive = reinterpret_cast(allocations[2]); } while (0); return error; } /** * Compute device memory needed for tile status */ __host__ __device__ __forceinline__ static cudaError_t AllocationSize( int num_tiles, ///< [in] Number of tiles size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation { // Specify storage allocation requirements size_t allocation_sizes[3]; allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives // Set the necessary size of the blob void* allocations[3] = {}; return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); } /** * Initialize (from device) */ __device__ __forceinline__ void InitializeStatus(int num_tiles) { int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; if (tile_idx < num_tiles) { // Not-yet-set d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); } if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) { // Padding d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); } } /** * Update the specified tile's inclusive value and corresponding status */ __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) { // Update tile inclusive value ThreadStore(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive); // Fence __threadfence(); // Update tile status ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); } /** * Update the specified tile's partial value and corresponding status */ __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) { // Update tile partial value ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); // Fence __threadfence(); // Update tile status ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); } /** * Wait for the corresponding tile to become non-invalid */ __device__ __forceinline__ void WaitForValid( int tile_idx, StatusWord &status, T &value) { do { status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); __threadfence(); // prevent hoisting loads from loop or loads below above this one } while (status == SCAN_TILE_INVALID); if (status == StatusWord(SCAN_TILE_PARTIAL)) value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); else value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); } }; 
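// Illustrative usage sketch (added commentary, not part of the original
// header).  The tile-state interface above is typically driven in three
// phases; the kernel/variable names below (init_kernel, scan_kernel, d_temp,
// bytes, num_tiles, tile_idx, block_aggregate, inclusive_prefix) are
// hypothetical:
//
//   // Host: size and initialize the descriptor storage
//   size_t bytes = 0;
//   cub::ScanTileState<int>::AllocationSize(num_tiles, bytes);
//   // ... allocate d_temp of at least `bytes` bytes ...
//   cub::ScanTileState<int> tile_state;
//   tile_state.Init(num_tiles, d_temp, bytes);
//
//   // Device (init_kernel): mark every tile descriptor as not yet produced
//   tile_state.InitializeStatus(num_tiles);
//
//   // Device (scan_kernel): each thread block publishes its partial
//   // aggregate, looks back at its predecessors (see TilePrefixCallbackOp
//   // below), and finally publishes its inclusive prefix
//   tile_state.SetPartial(tile_idx, block_aggregate);
//   // ... WaitForValid(...) on preceding tiles ...
//   tile_state.SetInclusive(tile_idx, inclusive_prefix);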
/****************************************************************************** * ReduceByKey tile status interface types for block-cooperative scans ******************************************************************************/ /** * Tile status interface for reduction by key. * */ template < typename ValueT, typename KeyT, bool SINGLE_WORD = (Traits::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> struct ReduceByKeyScanTileState; /** * Tile status interface for reduction by key, specialized for scan status and value types that * cannot be combined into one machine word. */ template < typename ValueT, typename KeyT> struct ReduceByKeyScanTileState : ScanTileState > { typedef ScanTileState > SuperClass; /// Constructor __host__ __device__ __forceinline__ ReduceByKeyScanTileState() : SuperClass() {} }; /** * Tile status interface for reduction by key, specialized for scan status and value types that * can be combined into one machine word that can be read/written coherently in a single access. */ template < typename ValueT, typename KeyT> struct ReduceByKeyScanTileState { typedef KeyValuePairKeyValuePairT; // Constants enum { PAIR_SIZE = static_cast(sizeof(ValueT) + sizeof(KeyT)), TXN_WORD_SIZE = 1 << Log2::VALUE, STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, }; // Status word type using StatusWord = cub::detail::conditional_t< STATUS_WORD_SIZE == 8, long long, cub::detail::conditional_t< STATUS_WORD_SIZE == 4, int, cub::detail::conditional_t>>; // Status word type using TxnWord = cub::detail::conditional_t< TXN_WORD_SIZE == 16, longlong2, cub::detail::conditional_t>; // Device word type (for when sizeof(ValueT) == sizeof(KeyT)) struct TileDescriptorBigStatus { KeyT key; ValueT value; StatusWord status; }; // Device word type (for when sizeof(ValueT) != sizeof(KeyT)) struct TileDescriptorLittleStatus { ValueT value; StatusWord status; KeyT key; }; // Device word type using TileDescriptor = cub::detail::conditional_t; // Device storage TxnWord *d_tile_descriptors; /// Constructor __host__ __device__ __forceinline__ ReduceByKeyScanTileState() : d_tile_descriptors(NULL) {} /// Initializer __host__ __device__ __forceinline__ cudaError_t Init( int /*num_tiles*/, ///< [in] Number of tiles void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation { d_tile_descriptors = reinterpret_cast(d_temp_storage); return cudaSuccess; } /** * Compute device memory needed for tile status */ __host__ __device__ __forceinline__ static cudaError_t AllocationSize( int num_tiles, ///< [in] Number of tiles size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation { temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors return cudaSuccess; } /** * Initialize (from device) */ __device__ __forceinline__ void InitializeStatus(int num_tiles) { int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; TxnWord val = TxnWord(); TileDescriptor *descriptor = reinterpret_cast(&val); if (tile_idx < num_tiles) { // Not-yet-set descriptor->status = StatusWord(SCAN_TILE_INVALID); d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; } if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) { // Padding descriptor->status = StatusWord(SCAN_TILE_OOB); d_tile_descriptors[threadIdx.x] = val; } } /** * Update the specified tile's inclusive value and corresponding status */ __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) { TileDescriptor tile_descriptor; tile_descriptor.status = SCAN_TILE_INCLUSIVE; tile_descriptor.value = tile_inclusive.value; tile_descriptor.key = tile_inclusive.key; TxnWord alias; *reinterpret_cast(&alias) = tile_descriptor; ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); } /** * Update the specified tile's partial value and corresponding status */ __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial) { TileDescriptor tile_descriptor; tile_descriptor.status = SCAN_TILE_PARTIAL; tile_descriptor.value = tile_partial.value; tile_descriptor.key = tile_partial.key; TxnWord alias; *reinterpret_cast(&alias) = tile_descriptor; ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); } /** * Wait for the corresponding tile to become non-invalid */ __device__ __forceinline__ void WaitForValid( int tile_idx, StatusWord &status, KeyValuePairT &value) { // TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); // TileDescriptor tile_descriptor = reinterpret_cast(alias); // // while (tile_descriptor.status == SCAN_TILE_INVALID) // { // __threadfence_block(); // prevent hoisting loads from loop // // alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); // tile_descriptor = reinterpret_cast(alias); // } // // status = tile_descriptor.status; // value.value = tile_descriptor.value; // value.key = tile_descriptor.key; TileDescriptor tile_descriptor; do { __threadfence_block(); // prevent hoisting loads from loop TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); status = tile_descriptor.status; value.value = tile_descriptor.value; value.key = tile_descriptor.key; } }; /****************************************************************************** * Prefix call-back operator for coupling local block scan within a * block-cooperative scan ******************************************************************************/ /** * Stateful block-scan prefix functor. 
Provides the the running prefix for * the current tile by using the call-back warp to wait on on * aggregates/prefixes from predecessor tiles to become available. */ template < typename T, typename ScanOpT, typename ScanTileStateT, int LEGACY_PTX_ARCH = 0> struct TilePrefixCallbackOp { // Parameterized warp reduce typedef WarpReduce WarpReduceT; // Temporary storage type struct _TempStorage { typename WarpReduceT::TempStorage warp_reduce; T exclusive_prefix; T inclusive_prefix; T block_aggregate; }; // Alias wrapper allowing temporary storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Type of status word typedef typename ScanTileStateT::StatusWord StatusWord; // Fields _TempStorage& temp_storage; ///< Reference to a warp-reduction instance ScanTileStateT& tile_status; ///< Interface to tile status ScanOpT scan_op; ///< Binary scan operator int tile_idx; ///< The current tile index T exclusive_prefix; ///< Exclusive prefix for the tile T inclusive_prefix; ///< Inclusive prefix for the tile // Constructor __device__ __forceinline__ TilePrefixCallbackOp( ScanTileStateT &tile_status, TempStorage &temp_storage, ScanOpT scan_op, int tile_idx) : temp_storage(temp_storage.Alias()), tile_status(tile_status), scan_op(scan_op), tile_idx(tile_idx) {} // Block until all predecessors within the warp-wide window have non-invalid status __device__ __forceinline__ void ProcessWindow( int predecessor_idx, ///< Preceding tile index to inspect StatusWord &predecessor_status, ///< [out] Preceding tile status T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles { T value; tile_status.WaitForValid(predecessor_idx, predecessor_status, value); // Perform a segmented reduction to get the prefix for the current window. // Use the swizzled scan operator because we are now scanning *down* towards thread0. 
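        // Added commentary: lane0 of the call-back warp inspects the nearest
        // predecessor tile and higher lanes inspect progressively older tiles.
        // A predecessor whose inclusive prefix is already published terminates
        // the look-back, so it is marked as a segment tail for the
        // tail-segmented reduction below.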
int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce( value, tail_flag, SwizzleScanOp(scan_op)); } // BlockScan prefix callback functor (called by the first warp) __device__ __forceinline__ T operator()(T block_aggregate) { // Update our status with our tile-aggregate if (threadIdx.x == 0) { detail::uninitialized_copy(&temp_storage.block_aggregate, block_aggregate); tile_status.SetPartial(tile_idx, block_aggregate); } int predecessor_idx = tile_idx - threadIdx.x - 1; StatusWord predecessor_status; T window_aggregate; // Wait for the warp-wide window of predecessor tiles to become valid ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); // The exclusive tile prefix starts out as the current window aggregate exclusive_prefix = window_aggregate; // Keep sliding the window back until we come across a tile whose inclusive prefix is known while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) { predecessor_idx -= CUB_PTX_WARP_THREADS; // Update exclusive tile prefix with the window prefix ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); } // Compute the inclusive tile prefix and update the status for this tile if (threadIdx.x == 0) { inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); tile_status.SetInclusive(tile_idx, inclusive_prefix); detail::uninitialized_copy(&temp_storage.exclusive_prefix, exclusive_prefix); detail::uninitialized_copy(&temp_storage.inclusive_prefix, inclusive_prefix); } // Return exclusive_prefix return exclusive_prefix; } // Get the exclusive prefix stored in temporary storage __device__ __forceinline__ T GetExclusivePrefix() { return temp_storage.exclusive_prefix; } // Get the inclusive prefix stored in temporary storage __device__ __forceinline__ T GetInclusivePrefix() { return temp_storage.inclusive_prefix; } // Get the block aggregate stored in temporary storage __device__ __forceinline__ T GetBlockAggregate() { return temp_storage.block_aggregate; } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/000077500000000000000000000000001434614775400141455ustar00rootroot00000000000000cub-2.0.1/cub/block/block_adjacent_difference.cuh000066400000000000000000001465431434614775400217600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::BlockAdjacentDifference class provides * [collective](index.html#sec0) methods for computing the differences * of adjacent elements partitioned across a CUDA thread block. */ #pragma once #include "../config.cuh" #include "../util_type.cuh" #include "../util_ptx.cuh" CUB_NAMESPACE_BEGIN /** * @brief BlockAdjacentDifference provides * [collective](index.html#sec0) methods for computing the * differences of adjacent elements partitioned across a CUDA thread * block. * * @ingroup BlockModule * * @par Overview * - BlockAdjacentDifference calculates the differences of adjacent elements in * the elements partitioned across a CUDA thread block. Because the binary * operation could be noncommutative, there are two sets of methods. * Methods named SubtractLeft subtract left element `i - 1` of input sequence * from current element `i`. Methods named SubtractRight subtract current * element `i` from the right one `i + 1`: * @par * @code * int values[4]; // [1, 2, 3, 4] * //... * int subtract_left_result[4]; <-- [ 1, 1, 1, 1 ] * int subtract_right_result[4]; <-- [ -1, -1, -1, 4 ] * @endcode * - For SubtractLeft, if the left element is out of bounds, the * output value is assigned to `input[0]` without modification. * - For SubtractRight, if the right element is out of bounds, the output value * is assigned to the current input value without modification. * - The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * This example can be easily adapted to the storage required by * BlockAdjacentDifference. * * @par Snippet * The code snippet below illustrates how to use @p BlockAdjacentDifference to * compute the left difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockAdjacentDifference for a 1D block of * // 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute adjacent_difference * int result[4]; * * BlockAdjacentDifferenceT(temp_storage).SubtractLeft( * result, * thread_data, * CustomDifference()); * * @endcode * @par * Suppose the set of input `thread_data` across the block of threads is * { [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }. * The corresponding output `result` in those threads will be * { [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }. 
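 *
 * @par
 * Note (added clarification, consistent with the parameter documentation of
 * the Subtract* methods below): the @p input array may be aliased to the
 * @p output array, so the differences can be computed in place.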
* */ template class BlockAdjacentDifference { private: /*************************************************************************** * Constants and type definitions **************************************************************************/ /// Constants /// The thread block size in threads static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; /// Shared memory storage layout type (last element from each thread's input) struct _TempStorage { T first_items[BLOCK_THREADS]; T last_items[BLOCK_THREADS]; }; /*************************************************************************** * Utility methods **************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Specialization for when FlagOp has third index param template ::HAS_PARAM> struct ApplyOp { // Apply flag operator static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx) { return flag_op(b, a, idx); } }; /// Specialization for when FlagOp does not have a third index param template struct ApplyOp { // Apply flag operator static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) { return flag_op(b, a); } }; /// Templated unrolling of item comparison (inductive case) struct Iterate { /** * Head flags * * @param[out] flags Calling thread's discontinuity head_flags * @param[in] input Calling thread's input items * @param[out] preds Calling thread's predecessor items * @param[in] flag_op Binary boolean flag predicate */ template static __device__ __forceinline__ void FlagHeads(int linear_tid, FlagT (&flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], T (&preds)[ITEMS_PER_THREAD], FlagOp flag_op) { #pragma unroll for (int i = 1; i < ITEMS_PER_THREAD; ++i) { preds[i] = input[i - 1]; flags[i] = ApplyOp::FlagT( flag_op, preds[i], input[i], (linear_tid * ITEMS_PER_THREAD) + i); } } /** * Tail flags * * @param[out] flags Calling thread's discontinuity head_flags * @param[in] input Calling thread's input items * @param[in] flag_op Binary boolean flag predicate */ template static __device__ __forceinline__ void FlagTails(int linear_tid, FlagT (&flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD - 1; ++i) { flags[i] = ApplyOp::FlagT( flag_op, input[i], input[i + 1], (linear_tid * ITEMS_PER_THREAD) + i + 1); } } }; /*************************************************************************** * Thread fields **************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; public: /// \smemstorage{BlockDiscontinuity} struct TempStorage : Uninitialized<_TempStorage> {}; /***********************************************************************//** * @name Collective constructors **************************************************************************/ //@{ /** * @brief Collective constructor using a private static allocation of shared * memory as temporary storage. */ __device__ __forceinline__ BlockAdjacentDifference() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as * temporary storage. 
* * @param[in] temp_storage Reference to memory allocation having layout type TempStorage */ __device__ __forceinline__ BlockAdjacentDifference(TempStorage &temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /***********************************************************************//** * @name Read left operations **************************************************************************/ //@{ /** * @brief Subtracts the left element of each adjacent pair of elements * partitioned across a CUDA thread block. * * @par * - \rowmajor * - \smemreuse * * @par Snippet * The code snippet below illustrates how to use @p BlockAdjacentDifference * to compute the left difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockAdjacentDifference for a 1D block * // of 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute adjacent_difference * BlockAdjacentDifferenceT(temp_storage).SubtractLeft( * thread_data, * thread_data, * CustomDifference()); * * @endcode * @par * Suppose the set of input `thread_data` across the block of threads is * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`. * The corresponding output `result` in those threads will be * `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`. * * @param[out] output * Calling thread's adjacent difference result * * @param[in] input * Calling thread's input items (may be aliased to @p output) * * @param[in] difference_op * Binary difference operator */ template __device__ __forceinline__ void SubtractLeft(T (&input)[ITEMS_PER_THREAD], OutputType (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { output[item] = difference_op(input[item], input[item - 1]); } if (linear_tid == 0) { output[0] = input[0]; } else { output[0] = difference_op(input[0], temp_storage.last_items[linear_tid - 1]); } } /** * @brief Subtracts the left element of each adjacent pair of elements * partitioned across a CUDA thread block. * * @par * - \rowmajor * - \smemreuse * * @par Snippet * The code snippet below illustrates how to use @p BlockAdjacentDifference * to compute the left difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockAdjacentDifference for a 1D block of * // 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... 
* * // The last item in the previous tile: * int tile_predecessor_item = ...; * * // Collectively compute adjacent_difference * BlockAdjacentDifferenceT(temp_storage).SubtractLeft( * thread_data, * thread_data, * CustomDifference(), * tile_predecessor_item); * * @endcode * @par * Suppose the set of input `thread_data` across the block of threads is * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`. * and that `tile_predecessor_item` is `3`. The corresponding output * `result` in those threads will be * `{ [1,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`. * * @param[out] output * Calling thread's adjacent difference result * * @param[in] input * Calling thread's input items (may be aliased to \p output) * * @param[in] difference_op * Binary difference operator * * @param[in] tile_predecessor_item * [thread0 only] item which is going to be * subtracted from the first tile item (input0 from * thread0). */ template __device__ __forceinline__ void SubtractLeft(T (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { output[item] = difference_op(input[item], input[item - 1]); } // Set flag for first thread-item if (linear_tid == 0) { output[0] = difference_op(input[0], tile_predecessor_item); } else { output[0] = difference_op(input[0], temp_storage.last_items[linear_tid - 1]); } } /** * @brief Subtracts the left element of each adjacent pair of elements * partitioned across a CUDA thread block. * * @par * - \rowmajor * - \smemreuse * * @par Snippet * The code snippet below illustrates how to use @p BlockAdjacentDifference * to compute the left difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockAdjacentDifference for a 1D block of * // 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * int valid_items = 9; * * // Collectively compute adjacent_difference * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile( * thread_data, * thread_data, * CustomDifference(), * valid_items); * * @endcode * @par * Suppose the set of input `thread_data` across the block of threads is * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`. * The corresponding output `result` in those threads will be * `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`. 
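   *
   * @par
   * Note (added clarification, consistent with the implementation below):
   * items at positions @p valid_items and beyond are copied to the output
   * unmodified.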
* * @param[out] output * Calling thread's adjacent difference result * * @param[in] input * Calling thread's input items (may be aliased to \p output) * * @param[in] difference_op * Binary difference operator * * @param[in] valid_items * Number of valid items in thread block */ template __device__ __forceinline__ void SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD], OutputType (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, int valid_items) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { output[item] = difference_op(input[item], input[item - 1]); } } else { #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { const int idx = linear_tid * ITEMS_PER_THREAD + item; if (idx < valid_items) { output[item] = difference_op(input[item], input[item - 1]); } else { output[item] = input[item]; } } } if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD) { output[0] = input[0]; } else { output[0] = difference_op(input[0], temp_storage.last_items[linear_tid - 1]); } } /** * @brief Subtracts the left element of each adjacent pair of elements * partitioned across a CUDA thread block. * * @par * - \rowmajor * - \smemreuse * * @par Snippet * The code snippet below illustrates how to use @p BlockAdjacentDifference * to compute the left difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockAdjacentDifference for a 1D block of * // 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * int valid_items = 9; * int tile_predecessor_item = 4; * * // Collectively compute adjacent_difference * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile( * thread_data, * thread_data, * CustomDifference(), * valid_items, * tile_predecessor_item); * * @endcode * @par * Suppose the set of input `thread_data` across the block of threads is * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`. * The corresponding output `result` in those threads will be * `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`. * * @param[out] output * Calling thread's adjacent difference result * * @param[in] input * Calling thread's input items (may be aliased to \p output) * * @param[in] difference_op * Binary difference operator * * @param[in] valid_items * Number of valid items in thread block * * @param[in] tile_predecessor_item * **[thread0 only]** item which is going to be * subtracted from the first tile item (input0 from * thread0). 
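   * Note (added clarification, consistent with the implementation below):
   * this item is only consumed when @p valid_items is greater than zero;
   * for an empty tile, thread0 copies its first input item through
   * unmodified.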
*/ template __device__ __forceinline__ void SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD], OutputType (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, int valid_items, T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { output[item] = difference_op(input[item], input[item - 1]); } } else { #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { const int idx = linear_tid * ITEMS_PER_THREAD + item; if (idx < valid_items) { output[item] = difference_op(input[item], input[item - 1]); } else { output[item] = input[item]; } } } if (valid_items <= linear_tid * ITEMS_PER_THREAD) { output[0] = input[0]; } else if (linear_tid == 0) { output[0] = difference_op(input[0], tile_predecessor_item); } else { output[0] = difference_op(input[0], temp_storage.last_items[linear_tid - 1]); } } //@} end member group /******************************************************************//** * @name Read right operations *********************************************************************/ //@{ /** * @brief Subtracts the right element of each adjacent pair of elements * partitioned across a CUDA thread block. * * @par * - \rowmajor * - \smemreuse * * @par Snippet * The code snippet below illustrates how to use @p BlockAdjacentDifference * to compute the right difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockAdjacentDifference for a 1D block of * // 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute adjacent_difference * BlockAdjacentDifferenceT(temp_storage).SubtractRight( * thread_data, * thread_data, * CustomDifference()); * * @endcode * @par * Suppose the set of input `thread_data` across the block of threads is * `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`. * The corresponding output `result` in those threads will be * `{ ..., [-1,2,1,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,4] }`. * * @param[out] output * Calling thread's adjacent difference result * * @param[in] input * Calling thread's input items (may be aliased to \p output) * * @param[in] difference_op * Binary difference operator */ template __device__ __forceinline__ void SubtractRight(T (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) { output[item] = difference_op(input[item], input[item + 1]); } if (linear_tid == BLOCK_THREADS - 1) { output[ITEMS_PER_THREAD - 1] = input[ITEMS_PER_THREAD - 1]; } else { output[ITEMS_PER_THREAD - 1] = difference_op(input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1]); } } /** * @brief Subtracts the right element of each adjacent pair of elements * partitioned across a CUDA thread block. 
* * @par * - \rowmajor * - \smemreuse * * @par Snippet * The code snippet below illustrates how to use @p BlockAdjacentDifference * to compute the right difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockAdjacentDifference for a 1D block of * // 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // The first item in the nest tile: * int tile_successor_item = ...; * * // Collectively compute adjacent_difference * BlockAdjacentDifferenceT(temp_storage).SubtractRight( * thread_data, * thread_data, * CustomDifference(), * tile_successor_item); * * @endcode * @par * Suppose the set of input `thread_data` across the block of threads is * `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`, * and that `tile_successor_item` is `3`. The corresponding output `result` * in those threads will be * `{ ..., [-1,2,1,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,1] }`. * * @param[out] output * Calling thread's adjacent difference result * * @param[in] input * Calling thread's input items (may be aliased to @p output) * * @param[in] difference_op * Binary difference operator * * @param[in] tile_successor_item * [threadBLOCK_THREADS-1 only] item * which is going to be subtracted from the last tile item * (inputITEMS_PER_THREAD-1 from * threadBLOCK_THREADS-1). */ template __device__ __forceinline__ void SubtractRight(T (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, T tile_successor_item) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item // Last thread : temp_storage.first_items[linear_tid + 1]; #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) { output[item] = difference_op(input[item], input[item + 1]); } output[ITEMS_PER_THREAD - 1] = difference_op(input[ITEMS_PER_THREAD - 1], successor_item); } /** * @brief Subtracts the right element of each adjacent pair in range of * elements partitioned across a CUDA thread block. * * @par * - \rowmajor * - \smemreuse * * @par Snippet * The code snippet below illustrates how to use @p BlockAdjacentDifference to * compute the right difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockAdjacentDifference for a 1D block of * // 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... 
* * // Collectively compute adjacent_difference * BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile( * thread_data, * thread_data, * CustomDifference(), * valid_items); * * @endcode * @par * Suppose the set of input `thread_data` across the block of threads is * `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`. * and that `valid_items` is `507`. The corresponding output `result` in * those threads will be * `{ ..., [-1,2,1,0], [0,0,0,-1], [-1,0,3,3], [3,4,1,4] }`. * * @param[out] output * Calling thread's adjacent difference result * * @param[in] input * Calling thread's input items (may be aliased to @p output) * * @param[in] difference_op * Binary difference operator * * @param[in] valid_items * Number of valid items in thread block */ template __device__ __forceinline__ void SubtractRightPartialTile(T (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, int valid_items) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); if ((linear_tid + 1) * ITEMS_PER_THREAD < valid_items) { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) { output[item] = difference_op(input[item], input[item + 1]); } output[ITEMS_PER_THREAD - 1] = difference_op(input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1]); } else { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = linear_tid * ITEMS_PER_THREAD + item; // Right element of input[valid_items - 1] is out of bounds. // According to the API it's copied into output array // without modification. if (idx < valid_items - 1) { output[item] = difference_op(input[item], input[item + 1]); } else { output[item] = input[item]; } } } } //@} end member group /******************************************************************//** * @name Head flag operations (deprecated) *********************************************************************/ //@{ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> CUB_DEPRECATED __device__ __forceinline__ void FlagHeads( FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); if (linear_tid == 0) { // Set flag for first thread-item (preds[0] is undefined) output[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; output[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set output for remaining items Iterate::FlagHeads(linear_tid, output, input, preds, flag_op); } /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. 
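   *
   * A possible migration (illustrative sketch, not a verified drop-in
   * replacement): pass a flag-returning functor such as cub::Inequality() as
   * the difference operator, e.g.
   * `SubtractLeft(input, head_flags, cub::Inequality(), tile_predecessor_item)`.
   * Note that without a tile_predecessor_item the semantics differ for the
   * very first item: FlagHeads unconditionally flags it, whereas SubtractLeft
   * copies it through unmodified.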
*/ template CUB_DEPRECATED __device__ __forceinline__ void FlagHeads( FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items FlagOp flag_op, ///< [in] Binary boolean flag predicate T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; output[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set output for remaining items Iterate::FlagHeads(linear_tid, output, input, preds, flag_op); } #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. */ template CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { T preds[ITEMS_PER_THREAD]; FlagHeads(output, input, preds, flag_op); } /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. */ template CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op, ///< [in] Binary boolean flag predicate T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). { T preds[ITEMS_PER_THREAD]; FlagHeads(output, input, preds, flag_op, tile_predecessor_item); } /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> CUB_DEPRECATED __device__ __forceinline__ void FlagTails( FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item output[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set output for remaining items Iterate::FlagTails(linear_tid, output, input, flag_op); } /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. 
*/ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> CUB_DEPRECATED __device__ __forceinline__ void FlagTails( FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op, ///< [in] Binary boolean flag predicate T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; output[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set output for remaining items Iterate::FlagTails(linear_tid, output, input, flag_op); } /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or * cub::BlockAdjacentDifference::SubtractRight instead. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = temp_storage.last_items[linear_tid - 1]; if (linear_tid == 0) { head_flags[0] = 1; } else { head_flags[0] = ApplyOp::FlagT( flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or * cub::BlockAdjacentDifference::SubtractRight instead. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). 
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item if (linear_tid == 0) { head_flags[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT( flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or * cub::BlockAdjacentDifference::SubtractRight instead. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT( flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or * cub::BlockAdjacentDifference::SubtractRight instead. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT( flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_discontinuity.cuh000066400000000000000000001472351434614775400207410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. 
*/ #pragma once #include "../config.cuh" #include "../util_type.cuh" #include "../util_ptx.cuh" CUB_NAMESPACE_BEGIN /** * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) * \ingroup BlockModule * * \tparam T The data type to be flagged. * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items * that differ from their predecessors (or successors). For example, head flags are convenient * for demarcating disjoint data segments as part of a segmented scan or reduction. * - \blocked * * \par Performance Considerations * - \granularity * * \par A Simple Example * \blockcollective{BlockDiscontinuity} * \par * The code snippet below illustrates the head flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int * typedef cub::BlockDiscontinuity BlockDiscontinuity; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute head flags for discontinuities in the segment * int head_flags[4]; * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. * The corresponding output \p head_flags in those threads will be * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * * \par Performance Considerations * - Incurs zero bank conflicts for most types * * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockDiscontinuity. 
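 *
 * \par
 * A rough sketch of that adaptation is shown below (the kernel name and the
 * assumption that the launch supplies at least
 * sizeof(BlockDiscontinuity::TempStorage) bytes of dynamic shared memory are
 * illustrative, not part of CUB): the dynamically allocated region is simply
 * reinterpreted as the collective's \p TempStorage type.
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
 *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
 *
 *     // Dynamically allocated shared memory (sized at launch time)
 *     extern __shared__ char smem[];
 *
 *     // Re-purpose the dynamic allocation as BlockDiscontinuity storage
 *     BlockDiscontinuity::TempStorage &temp_storage =
 *         *reinterpret_cast<BlockDiscontinuity::TempStorage*>(smem);
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
 *
 *     // Collectively compute head flags for discontinuities in the segment
 *     int head_flags[4];
 *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
 * }
 * \endcode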
*/ template < typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockDiscontinuity { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Shared memory storage layout type (last element from each thread's input) struct _TempStorage { T first_items[BLOCK_THREADS]; T last_items[BLOCK_THREADS]; }; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Specialization for when FlagOp has third index param template ::HAS_PARAM> struct ApplyOp { // Apply flag operator static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx) { return flag_op(a, b, idx); } }; /// Specialization for when FlagOp does not have a third index param template struct ApplyOp { // Apply flag operator static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) { return flag_op(a, b); } }; /// Templated unrolling of item comparison (inductive case) struct Iterate { // Head flags template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> static __device__ __forceinline__ void FlagHeads( int linear_tid, FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items FlagOp flag_op) ///< [in] Binary boolean flag predicate { #pragma unroll for (int i = 1; i < ITEMS_PER_THREAD; ++i) { preds[i] = input[i - 1]; flags[i] = ApplyOp::FlagT( flag_op, preds[i], input[i], (linear_tid * ITEMS_PER_THREAD) + i); } } // Tail flags template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> static __device__ __forceinline__ void FlagTails( int linear_tid, FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD - 1; ++i) { flags[i] = ApplyOp::FlagT( flag_op, input[i], input[i + 1], (linear_tid * ITEMS_PER_THREAD) + i + 1); } } }; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; public: /// \smemstorage{BlockDiscontinuity} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
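 *
 * (The storage obtained this way is a function-scope __shared__ allocation that is
 * private to this collective; unlike storage passed in through the constructor below,
 * it cannot be aliased or re-used by other collectives in the same kernel.)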
*/ __device__ __forceinline__ BlockDiscontinuity() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockDiscontinuity( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Head flag operations *********************************************************************/ //@{ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagHeads( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); if (linear_tid == 0) { // Set flag for first thread-item (preds[0] is undefined) head_flags[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); } template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagHeads( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items FlagOp flag_op, ///< [in] Binary boolean flag predicate T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); } #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. * * \par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) * returns \p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is always flagged. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates the head-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int * typedef cub::BlockDiscontinuity BlockDiscontinuity; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute head flags for discontinuities in the segment * int head_flags[4]; * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. * The corresponding output \p head_flags in those threads will be * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam FlagT [inferred] The flag type (must be an integer type) * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagHeads( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { T preds[ITEMS_PER_THREAD]; FlagHeads(head_flags, input, preds, flag_op); } /** * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. * * \par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) * returns \p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is compared * against \p tile_predecessor_item. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates the head-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int * typedef cub::BlockDiscontinuity BlockDiscontinuity; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Have thread0 obtain the predecessor item for the entire tile * int tile_predecessor_item; * if (threadIdx.x == 0) tile_predecessor_item == ... * * // Collectively compute head flags for discontinuities in the segment * int head_flags[4]; * BlockDiscontinuity(temp_storage).FlagHeads( * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. 
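 * (For thread0, the first input item is \p 0 and is compared against
 * \p tile_predecessor_item, which is also \p 0, so cub::Inequality() returns \p false
 * and the first head flag is \p 0 rather than \p 1.)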
* * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam FlagT [inferred] The flag type (must be an integer type) * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagHeads( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op, ///< [in] Binary boolean flag predicate T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). { T preds[ITEMS_PER_THREAD]; FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); } //@} end member group /******************************************************************//** * \name Tail flag operations *********************************************************************/ //@{ /** * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. * * \par * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) * returns \p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is always flagged. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates the tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int * typedef cub::BlockDiscontinuity BlockDiscontinuity; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute tail flags for discontinuities in the segment * int tail_flags[4]; * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. * The corresponding output \p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam FlagT [inferred] The flag type (must be an integer type) * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
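 *
 * \par
 * The snippets above use the two-argument cub::Inequality(); a minimal sketch of the
 * optional three-argument (indexed) form follows. The functor name \p IndexedInequality
 * and the segment length of 64 are illustrative, not part of CUB. A functor with this
 * signature receives \p b_index, the rank of \p b within the tile, and can use it in
 * addition to the item values:
 * \code
 * struct IndexedInequality
 * {
 *     __host__ __device__ __forceinline__
 *     bool operator()(const int &a, const int &b, unsigned int b_index)
 *     {
 *         // Flag on value change, or whenever the rank of b is a multiple of 64
 *         return (a != b) || (b_index % 64 == 0);
 *     }
 * };
 * \endcode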
*/ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagTails( FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. * * \par * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) * returns \p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is compared * against \p tile_successor_item. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates the tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int * typedef cub::BlockDiscontinuity BlockDiscontinuity; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Have thread127 obtain the successor item for the entire tile * int tile_successor_item; * if (threadIdx.x == 127) tile_successor_item == ... * * // Collectively compute tail flags for discontinuities in the segment * int tail_flags[4]; * BlockDiscontinuity(temp_storage).FlagTails( * tail_flags, thread_data, cub::Inequality(), tile_successor_item); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam FlagT [inferred] The flag type (must be an integer type) * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
*/ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagTails( FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op, ///< [in] Binary boolean flag predicate T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } //@} end member group /******************************************************************//** * \name Head & tail flag operations *********************************************************************/ //@{ /** * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. * * \par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) * returns \p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is always flagged. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) * returns \p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is always flagged. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int * typedef cub::BlockDiscontinuity BlockDiscontinuity; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute head and flags for discontinuities in the segment * int head_flags[4]; * int tail_flags[4]; * BlockDiscontinuity(temp_storage).FlagTails( * head_flags, tail_flags, thread_data, cub::Inequality()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } * and that the tile_successor_item is \p 125. The corresponding output \p head_flags * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * and the corresponding output \p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
* \tparam FlagT [inferred] The flag type (must be an integer type) * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item if (linear_tid == 0) { head_flags[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT( flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. * * \par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) * returns \p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is always flagged. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) * returns \p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is compared * against \p tile_predecessor_item. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int * typedef cub::BlockDiscontinuity BlockDiscontinuity; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Have thread127 obtain the successor item for the entire tile * int tile_successor_item; * if (threadIdx.x == 127) tile_successor_item == ... 
* * // Collectively compute head and flags for discontinuities in the segment * int head_flags[4]; * int tail_flags[4]; * BlockDiscontinuity(temp_storage).FlagTails( * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } * and that the tile_successor_item is \p 125. The corresponding output \p head_flags * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * and the corresponding output \p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam FlagT [inferred] The flag type (must be an integer type) * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item if (linear_tid == 0) { head_flags[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT( flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. * * \par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) * returns \p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is compared * against \p tile_predecessor_item. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) * returns \p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is always flagged. 
* - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int * typedef cub::BlockDiscontinuity BlockDiscontinuity; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Have thread0 obtain the predecessor item for the entire tile * int tile_predecessor_item; * if (threadIdx.x == 0) tile_predecessor_item == ... * * // Have thread127 obtain the successor item for the entire tile * int tile_successor_item; * if (threadIdx.x == 127) tile_successor_item == ... * * // Collectively compute head and flags for discontinuities in the segment * int head_flags[4]; * int tail_flags[4]; * BlockDiscontinuity(temp_storage).FlagTails( * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, * thread_data, cub::Inequality()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, * that the \p tile_predecessor_item is \p 0, and that the * \p tile_successor_item is \p 125. The corresponding output \p head_flags * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * and the corresponding output \p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam FlagT [inferred] The flag type (must be an integer type) * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. */ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT( flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. * * \par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) * returns \p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is compared * against \p tile_predecessor_item. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) * returns \p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is compared * against \p tile_successor_item. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int * typedef cub::BlockDiscontinuity BlockDiscontinuity; * * // Allocate shared memory for BlockDiscontinuity * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Have thread0 obtain the predecessor item for the entire tile * int tile_predecessor_item; * if (threadIdx.x == 0) tile_predecessor_item == ... * * // Have thread127 obtain the successor item for the entire tile * int tile_successor_item; * if (threadIdx.x == 127) tile_successor_item == ... * * // Collectively compute head and flags for discontinuities in the segment * int head_flags[4]; * int tail_flags[4]; * BlockDiscontinuity(temp_storage).FlagTails( * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, * thread_data, cub::Inequality()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, * that the \p tile_predecessor_item is \p 0, and that the * \p tile_successor_item is \p 125. The corresponding output \p head_flags * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * and the corresponding output \p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam FlagT [inferred] The flag type (must be an integer type) * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
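 *
 * \par
 * The snippets above elide how the boundary items are obtained. A common approach,
 * sketched below under the assumption that the tile's 512 items start at offset
 * \p tile_base of a global array \p d_data (both names are illustrative, not part of
 * CUB), is to read them from the elements neighboring the tile:
 * \code
 * int tile_predecessor_item = 0;
 * int tile_successor_item   = 0;
 * if (threadIdx.x == 0)
 *     tile_predecessor_item = (tile_base > 0) ? d_data[tile_base - 1] : 0;
 * if (threadIdx.x == 127)
 *     tile_successor_item = d_data[tile_base + 512];   // assumes a full successor tile exists
 * \endcode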
*/ template < int ITEMS_PER_THREAD, typename FlagT, typename FlagOp> __device__ __forceinline__ void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items FlagOp flag_op) ///< [in] Binary boolean flag predicate { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT( flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } //@} end member group }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_exchange.cuh000066400000000000000000001417041434614775400176110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. */ #pragma once #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) * \ingroup BlockModule * * \tparam T The data type to be exchanged. * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - It is commonplace for blocks of threads to rearrange data items between * threads. For example, the device-accessible memory subsystem prefers access patterns * where data items are "striped" across threads (where consecutive threads access consecutive items), * yet most block-wide operations prefer a "blocked" partitioning of items across threads * (where consecutive items belong to a single thread). * - BlockExchange supports the following types of data exchanges: * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3) * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3) * - \rowmajor * * \par A Simple Example * \blockcollective{BlockExchange} * \par * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement * of 512 integer items partitioned across 128 threads where each thread owns 4 items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockExchange BlockExchange; * * // Allocate shared memory for BlockExchange * __shared__ typename BlockExchange::TempStorage temp_storage; * * // Load a tile of data striped across threads * int thread_data[4]; * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); * * // Collectively exchange data into a blocked arrangement across threads * BlockExchange(temp_storage).StripedToBlocked(thread_data); * * \endcode * \par * Suppose the set of striped input \p thread_data across the block of threads is * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. * The corresponding output \p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * * \par Performance Considerations * - Proper device-specific padding ensures zero bank conflicts for most types. 
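 *   (Concretely, when \p ITEMS_PER_THREAD is a power of two greater than 4, one padding
 *   slot is inserted for every shared-memory bank's worth of items to avoid bank
 *   conflicts during the thread-contiguous accesses; otherwise 128b loads typically
 *   suffice.)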
* * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockExchange. */ template < typename InputT, int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockExchange { private: /****************************************************************************** * Constants ******************************************************************************/ /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0), SMEM_BANKS = 1 << LOG_SMEM_BANKS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, }; /****************************************************************************** * Type definitions ******************************************************************************/ /// Shared memory storage layout type struct __align__(16) _TempStorage { InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; }; public: /// \smemstorage{BlockExchange} struct TempStorage : Uninitialized<_TempStorage> {}; private: /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; unsigned int lane_id; unsigned int warp_id; unsigned int warp_offset; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /** * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. */ template __device__ __forceinline__ void BlockedToStriped( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; output_items[ITEM] = temp_storage.buff[item_offset]; } } /** * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. */ template __device__ __forceinline__ void BlockedToStriped( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; #pragma unroll for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) { const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // Read a strip of items const int STRIP_OFFSET = ITEM * BLOCK_THREADS; const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) { int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) { if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; temp_items[ITEM] = temp_storage.buff[item_offset]; } } } } // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { output_items[ITEM] = temp_items[ITEM]; } } /** * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing */ template __device__ __forceinline__ void BlockedToWarpStriped( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; output_items[ITEM] = temp_storage.buff[item_offset]; } } /** * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing */ template __device__ __forceinline__ void BlockedToWarpStriped( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. 
OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. Int2Type /*time_slicing*/) { if (warp_id == 0) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; output_items[ITEM] = temp_storage.buff[item_offset]; } } #pragma unroll for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) { CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; output_items[ITEM] = temp_storage.buff[item_offset]; } } } } /** * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. */ template __device__ __forceinline__ void StripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } CTA_SYNC(); // No timeslicing #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; output_items[ITEM] = temp_storage.buff[item_offset]; } } /** * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. */ template __device__ __forceinline__ void StripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
Int2Type /*time_slicing*/) { // Warp time-slicing InputT temp_items[ITEMS_PER_THREAD]; #pragma unroll for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) { const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // Write a strip of items const int STRIP_OFFSET = ITEM * BLOCK_THREADS; const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) { int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) { if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } } } CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; temp_items[ITEM] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { output_items[ITEM] = temp_items[ITEM]; } } /** * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing */ template __device__ __forceinline__ void WarpStripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(output_items + ITEM, temp_storage.buff[item_offset]); } } /** * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing */ template __device__ __forceinline__ void WarpStripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. Int2Type /*time_slicing*/) { #pragma unroll for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) { CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; output_items[ITEM] = temp_storage.buff[item_offset]; } } } } /** * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. 
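 * (Each thread writes its items to temp_storage.buff at their scatter ranks, with the
 * optional bank-conflict padding applied to each offset, synchronizes, and then reads
 * them back in blocked order at linear_tid * ITEMS_PER_THREAD + ITEM.)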
*/ template __device__ __forceinline__ void ScatterToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM]; if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); output_items[ITEM] = temp_storage.buff[item_offset]; } } /** * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. */ template __device__ __forceinline__ void ScatterToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; #pragma unroll for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) { CTA_SYNC(); const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM] - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) { if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } } CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); temp_items[ITEM] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { output_items[ITEM] = temp_items[ITEM]; } } /** * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. */ template __device__ __forceinline__ void ScatterToStriped( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM]; if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); output_items[ITEM] = temp_storage.buff[item_offset]; } } /** * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. */ template __device__ __forceinline__ void ScatterToStriped( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; #pragma unroll for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) { const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM] - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) { if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // Read a strip of items const int STRIP_OFFSET = ITEM * BLOCK_THREADS; const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) { int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) { if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; temp_items[ITEM] = temp_storage.buff[item_offset]; } } } } // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { output_items[ITEM] = temp_items[ITEM]; } } public: /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockExchange() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), lane_id(LaneId()), warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockExchange( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), lane_id(LaneId()), warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} //@} end member group /******************************************************************//** * \name Structured exchanges *********************************************************************/ //@{ /** * \brief Transposes data items from striped arrangement to blocked arrangement. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement * of 512 integer items partitioned across 128 threads where each thread owns 4 items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockExchange BlockExchange; * * // Allocate shared memory for BlockExchange * __shared__ typename BlockExchange::TempStorage temp_storage; * * // Load a tile of ordered data into a striped arrangement across block threads * int thread_data[4]; * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); * * // Collectively exchange data into a blocked arrangement across threads * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); * * \endcode * \par * Suppose the set of striped input \p thread_data across the block of threads is * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. * The corresponding output \p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * */ template __device__ __forceinline__ void StripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. { StripedToBlocked(input_items, output_items, Int2Type()); } /** * \brief Transposes data items from blocked arrangement to striped arrangement. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement * of 512 integer items partitioned across 128 threads where each thread owns 4 items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockExchange BlockExchange; * * // Allocate shared memory for BlockExchange * __shared__ typename BlockExchange::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively exchange data into a striped arrangement across threads * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data); * * // Store data striped across block threads into an ordered tile * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); * * \endcode * \par * Suppose the set of blocked input \p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * The corresponding output \p thread_data in those threads will be * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in * preparation for storing to device-accessible memory. * */ template __device__ __forceinline__ void BlockedToStriped( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. 
OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. { BlockedToStriped(input_items, output_items, Int2Type()); } /** * \brief Transposes data items from warp-striped arrangement to blocked arrangement. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement * of 512 integer items partitioned across 128 threads where each thread owns 4 items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockExchange BlockExchange; * * // Allocate shared memory for BlockExchange * __shared__ typename BlockExchange::TempStorage temp_storage; * * // Load a tile of ordered data into a warp-striped arrangement across warp threads * int thread_data[4]; * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); * * // Collectively exchange data into a blocked arrangement across threads * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); * * \endcode * \par * Suppose the set of warp-striped input \p thread_data across the block of threads is * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } * after loading from device-accessible memory. (The first 128 items are striped across * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) * The corresponding output \p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * */ template __device__ __forceinline__ void WarpStripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. { WarpStripedToBlocked(input_items, output_items, Int2Type()); } /** * \brief Transposes data items from blocked arrangement to warp-striped arrangement. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement * of 512 integer items partitioned across 128 threads where each thread owns 4 items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockExchange BlockExchange; * * // Allocate shared memory for BlockExchange * __shared__ typename BlockExchange::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively exchange data into a warp-striped arrangement across threads * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data); * * // Store data striped across warp threads into an ordered tile * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); * * \endcode * \par * Suppose the set of blocked input \p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * The corresponding output \p thread_data in those threads will be * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } * in preparation for storing to device-accessible memory. 
(The first 128 items are striped across * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) * */ template __device__ __forceinline__ void BlockedToWarpStriped( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. { BlockedToWarpStriped(input_items, output_items, Int2Type()); } //@} end member group /******************************************************************//** * \name Scatter exchanges *********************************************************************/ //@{ /** * \brief Exchanges data items annotated by rank into blocked arrangement. * * \par * - \smemreuse * * \tparam OffsetT [inferred] Signed integer type for local offsets */ template __device__ __forceinline__ void ScatterToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks { ScatterToBlocked(input_items, output_items, ranks, Int2Type()); } /** * \brief Exchanges data items annotated by rank into striped arrangement. * * \par * - \smemreuse * * \tparam OffsetT [inferred] Signed integer type for local offsets */ template __device__ __forceinline__ void ScatterToStriped( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks { ScatterToStriped(input_items, output_items, ranks, Int2Type()); } /** * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. * * \par * - \smemreuse * * \tparam OffsetT [inferred] Signed integer type for local offsets */ template __device__ __forceinline__ void ScatterToStripedGuarded( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM]; if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); if (ranks[ITEM] >= 0) temp_storage.buff[item_offset] = input_items[ITEM]; } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); output_items[ITEM] = temp_storage.buff[item_offset]; } } /** * \brief Exchanges valid data items annotated by rank into striped arrangement. 
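 *
 * \par
 * A minimal usage sketch; the kernel name, block shape, and the way ranks and
 * validity flags are produced below are illustrative assumptions:
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleScatterFlaggedKernel(int *d_out)
 * {
 *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
 *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
 *     __shared__ typename BlockExchange::TempStorage temp_storage;
 *
 *     int  items[4];
 *     int  ranks[4];
 *     bool is_valid[4];
 *     #pragma unroll
 *     for (int i = 0; i < 4; ++i)
 *     {
 *         int idx     = threadIdx.x * 4 + i;
 *         items[i]    = idx;
 *         is_valid[i] = (idx % 2 == 0);   // keep even-valued items only
 *         ranks[i]    = idx / 2;          // compacted destination rank
 *     }
 *
 *     // Only flagged items are written; striped positions at or beyond the
 *     // number of valid items hold indeterminate data afterwards
 *     BlockExchange(temp_storage).ScatterToStripedFlagged(items, items, ranks, is_valid);
 *
 *     // Store the compacted, striped result
 *     cub::StoreDirectStriped<128>(threadIdx.x, d_out, items);
 * }
 * \endcode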
* * \par * - \smemreuse * * \tparam OffsetT [inferred] Signed integer type for local offsets * \tparam ValidFlag [inferred] FlagT type denoting which items are valid */ template __device__ __forceinline__ void ScatterToStripedFlagged( InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks ValidFlag (&is_valid)[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM]; if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); if (is_valid[ITEM]) temp_storage.buff[item_offset] = input_items[ITEM]; } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); output_items[ITEM] = temp_storage.buff[item_offset]; } } //@} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document __device__ __forceinline__ void StripedToBlocked( InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. { StripedToBlocked(items, items); } __device__ __forceinline__ void BlockedToStriped( InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. { BlockedToStriped(items, items); } __device__ __forceinline__ void WarpStripedToBlocked( InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. { WarpStripedToBlocked(items, items); } __device__ __forceinline__ void BlockedToWarpStriped( InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. { BlockedToWarpStriped(items, items); } template __device__ __forceinline__ void ScatterToBlocked( InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks { ScatterToBlocked(items, items, ranks); } template __device__ __forceinline__ void ScatterToStriped( InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks { ScatterToStriped(items, items, ranks); } template __device__ __forceinline__ void ScatterToStripedGuarded( InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks { ScatterToStripedGuarded(items, items, ranks); } template __device__ __forceinline__ void ScatterToStripedFlagged( InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. 
OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks ValidFlag (&is_valid)[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity { ScatterToStriped(items, items, ranks, is_valid); } #endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_histogram.cuh000066400000000000000000000405111434614775400200160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ #pragma once #include "specializations/block_histogram_sort.cuh" #include "specializations/block_histogram_atomic.cuh" #include "../config.cuh" #include "../util_ptx.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Algorithmic variants ******************************************************************************/ /** * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. */ enum BlockHistogramAlgorithm { /** * \par Overview * Sorting followed by differentiation. Execution is comprised of two phases: * -# Sort the data using efficient radix sort * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. * * \par Performance Considerations * Delivers consistent throughput regardless of sample bin distribution. 
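 *
 * \par
 * A minimal sketch of selecting this algorithm explicitly; the kernel name and
 * the way samples are generated below are illustrative assumptions:
 * \code
 * #include <cub/block/block_histogram.cuh>
 *
 * __global__ void ExampleSortHistogramKernel(unsigned int *d_histogram)
 * {
 *     // 256-bin histogram over 128 threads with 4 samples each, sort-based algorithm
 *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256,
 *                                 cub::BLOCK_HISTO_SORT> BlockHistogram;
 *     __shared__ typename BlockHistogram::TempStorage temp_storage;
 *     __shared__ unsigned int smem_histogram[256];
 *
 *     unsigned char samples[4];
 *     #pragma unroll
 *     for (int i = 0; i < 4; ++i)
 *         samples[i] = static_cast<unsigned char>(threadIdx.x + i);
 *
 *     // Compute the block-wide histogram in shared memory
 *     BlockHistogram(temp_storage).Histogram(samples, smem_histogram);
 *
 *     __syncthreads();
 *     // Publish this block's bin counts to global memory
 *     for (int bin = threadIdx.x; bin < 256; bin += 128)
 *         d_histogram[bin] = smem_histogram[bin];
 * }
 * \endcode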
*/ BLOCK_HISTO_SORT, /** * \par Overview * Use atomic addition to update byte counts directly * * \par Performance Considerations * Performance is strongly tied to the hardware implementation of atomic * addition, and may be significantly degraded for non uniformly-random * input distributions where many concurrent updates are likely to be * made to the same bin counter. */ BLOCK_HISTO_ATOMIC, }; /****************************************************************************** * Block histogram ******************************************************************************/ /** * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png) * \ingroup BlockModule * * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ITEMS_PER_THREAD The number of items per thread * \tparam BINS The number bins within the histogram * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A histogram * counts the number of observations that fall into each of the disjoint categories (known as bins). * - The `T` type must be implicitly castable to an integer type. * - BlockHistogram expects each integral `input[i]` value to satisfy * `0 <= input[i] < BINS`. Values outside of this range result in undefined * behavior. * - BlockHistogram can be optionally specialized to use different algorithms: * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) * * \par Performance Considerations * - \granularity * * \par A Simple Example * \blockcollective{BlockHistogram} * \par * The code snippet below illustrates a 256-bin histogram of 512 integer samples that * are partitioned across 128 threads where each thread owns 4 samples. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each * typedef cub::BlockHistogram BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; * * // Allocate shared memory for block-wide histogram bin counts * __shared__ unsigned int smem_histogram[256]; * * // Obtain input samples per thread * unsigned char data[4]; * ... * * // Compute the block-wide histogram * BlockHistogram(temp_storage).Histogram(data, smem_histogram); * * \endcode * * \par Performance and Usage Considerations * - All input values must fall between [0, BINS), or behavior is undefined. 
* - The histogram output can be constructed in shared or device-accessible memory * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives * * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockHistogram. */ template < typename T, int BLOCK_DIM_X, int ITEMS_PER_THREAD, int BINS, BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockHistogram { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Internal specialization. using InternalBlockHistogram = cub::detail::conditional_t, BlockHistogramAtomic>; /// Shared memory storage layout type for BlockHistogram typedef typename InternalBlockHistogram::TempStorage _TempStorage; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } public: /// \smemstorage{BlockHistogram} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockHistogram() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockHistogram( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Histogram operations *********************************************************************/ //@{ /** * \brief Initialize the shared histogram counters to zero. * * \par Snippet * The code snippet below illustrates a the initialization and update of a * histogram of 512 integer samples that are partitioned across 128 threads * where each thread owns 4 samples. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each * typedef cub::BlockHistogram BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; * * // Allocate shared memory for block-wide histogram bin counts * __shared__ unsigned int smem_histogram[256]; * * // Obtain input samples per thread * unsigned char thread_samples[4]; * ... * * // Initialize the block-wide histogram * BlockHistogram(temp_storage).InitHistogram(smem_histogram); * * // Update the block-wide histogram * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); * * \endcode * * \tparam CounterT [inferred] Histogram counter type */ template __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) { // Initialize histogram bin counts to zeros int histo_offset = 0; #pragma unroll for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) { histogram[histo_offset + linear_tid] = 0; } // Finish up with guarded initialization if necessary if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) { histogram[histo_offset + linear_tid] = 0; } } /** * \brief Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements. * * \par * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a 256-bin histogram of 512 integer samples that * are partitioned across 128 threads where each thread owns 4 samples. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each * typedef cub::BlockHistogram BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; * * // Allocate shared memory for block-wide histogram bin counts * __shared__ unsigned int smem_histogram[256]; * * // Obtain input samples per thread * unsigned char thread_samples[4]; * ... * * // Compute the block-wide histogram * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); * * \endcode * * \tparam CounterT [inferred] Histogram counter type */ template < typename CounterT > __device__ __forceinline__ void Histogram( T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram { // Initialize histogram bin counts to zeros InitHistogram(histogram); CTA_SYNC(); // Composite the histogram InternalBlockHistogram(temp_storage).Composite(items, histogram); } /** * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. * * \par * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a the initialization and update of a * histogram of 512 integer samples that are partitioned across 128 threads * where each thread owns 4 samples. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each * typedef cub::BlockHistogram BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; * * // Allocate shared memory for block-wide histogram bin counts * __shared__ unsigned int smem_histogram[256]; * * // Obtain input samples per thread * unsigned char thread_samples[4]; * ... * * // Initialize the block-wide histogram * BlockHistogram(temp_storage).InitHistogram(smem_histogram); * * // Update the block-wide histogram * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); * * \endcode * * \tparam CounterT [inferred] Histogram counter type */ template < typename CounterT > __device__ __forceinline__ void Composite( T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram { InternalBlockHistogram(temp_storage).Composite(items, histogram); } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_load.cuh000066400000000000000000001564651434614775400167600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Operations for reading linear tiles of data into the CUDA thread block. 
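 *
 * \par
 * As a minimal sketch of the kind of operation this file provides (the kernel
 * name is an illustrative assumption and a full tile is assumed):
 * \code
 * #include <cub/block/block_load.cuh>
 *
 * __global__ void ExampleDirectLoadKernel(int *d_in)
 * {
 *     // Each of the 128 threads gathers 4 consecutive items (a "blocked" arrangement)
 *     int items[4];
 *     cub::LoadDirectBlocked(threadIdx.x, d_in, items);
 * }
 * \endcode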
*/ #pragma once #include #include #include "../block/block_exchange.cuh" #include "../iterator/cache_modified_input_iterator.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIo * @{ */ /******************************************************************//** * \name Blocked arrangement I/O (direct) *********************************************************************/ //@{ /** * \brief Load a linear segment of items into a blocked arrangement across the thread block. * * \blocked * * \tparam T [inferred] The data type to load. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. */ template < typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT> __device__ __forceinline__ void LoadDirectBlocked( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { // Load directly in thread-blocked order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; } } /** * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. * * \blocked * * \tparam T [inferred] The data type to load. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. */ template < typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT> __device__ __forceinline__ void LoadDirectBlocked( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) { items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; } } } /** * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. * * \blocked * * \tparam T [inferred] The data type to load. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
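 *
 * \par
 * A minimal sketch of this guarded overload; the kernel name, tile shape, and
 * the chosen default value are illustrative assumptions:
 * \code
 * #include <cub/block/block_load.cuh>
 *
 * __global__ void ExampleGuardedBlockedLoadKernel(int *d_in, int num_items)
 * {
 *     int items[4];
 *     // Positions at or beyond num_items are assigned -1 instead of being read
 *     cub::LoadDirectBlocked(threadIdx.x, d_in, items, num_items, -1);
 * }
 * \endcode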
*/ template < typename InputT, typename DefaultT, int ITEMS_PER_THREAD, typename InputIteratorT> __device__ __forceinline__ void LoadDirectBlocked( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) items[ITEM] = oob_default; LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Internal implementation for load vectorization */ template < CacheLoadModifier MODIFIER, typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void InternalLoadDirectBlockedVectorized( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) T *block_ptr, ///< [in] Input pointer for loading from T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { // Biggest memory access word that T is a whole multiple of typedef typename UnitWord::DeviceWord DeviceWord; enum { TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? 4 : (TOTAL_WORDS % 2 == 0) ? 2 : 1, VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, }; // Vector type typedef typename CubVector::Type Vector; // Vector items Vector vec_items[VECTORS_PER_THREAD]; // Aliased input ptr Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); // Load directly in thread-blocked order #pragma unroll for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) { vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); } // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); } } #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Load a linear segment of items into a blocked arrangement across the thread block. * * \blocked * * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned * * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: * - \p ITEMS_PER_THREAD is odd * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) * * \tparam T [inferred] The data type to load. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. */ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void LoadDirectBlockedVectorized( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) T *block_ptr, ///< [in] Input pointer for loading from T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } //@} end member group /******************************************************************//** * \name Striped arrangement I/O (direct) *********************************************************************/ //@{ /** * \brief Load a linear segment of items into a striped arrangement across the thread block. * * \striped * * \tparam BLOCK_THREADS The thread block size in threads * \tparam T [inferred] The data type to load. 
* \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. */ template < int BLOCK_THREADS, typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT> __device__ __forceinline__ void LoadDirectStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; } } /** * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range * * \striped * * \tparam BLOCK_THREADS The thread block size in threads * \tparam T [inferred] The data type to load. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. */ template < int BLOCK_THREADS, typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT> __device__ __forceinline__ void LoadDirectStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) { items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; } } } /** * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. * * \striped * * \tparam BLOCK_THREADS The thread block size in threads * \tparam T [inferred] The data type to load. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. */ template < int BLOCK_THREADS, typename InputT, typename DefaultT, int ITEMS_PER_THREAD, typename InputIteratorT> __device__ __forceinline__ void LoadDirectStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) items[ITEM] = oob_default; LoadDirectStriped(linear_tid, block_itr, items, valid_items); } //@} end member group /******************************************************************//** * \name Warp-striped arrangement I/O (direct) *********************************************************************/ //@{ /** * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. 
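 *
 * \par
 * A minimal sketch (the kernel name is an illustrative assumption and a full
 * tile is assumed): each warp reads a contiguous 128-item segment, striped
 * across its 32 lanes.
 * \code
 * #include <cub/block/block_load.cuh>
 *
 * __global__ void ExampleWarpStripedLoadKernel(int *d_in)
 * {
 *     int items[4];
 *     cub::LoadDirectWarpStriped(threadIdx.x, d_in, items);
 * }
 * \endcode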
* * \warpstriped * * \par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * * \tparam T [inferred] The data type to load. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. */ template < typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT> __device__ __forceinline__ void LoadDirectWarpStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); } } /** * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range * * \warpstriped * * \par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * * \tparam T [inferred] The data type to load. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. */ template < typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT> __device__ __forceinline__ void LoadDirectWarpStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) { new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); } } } /** * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. * * \warpstriped * * \par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * * \tparam T [inferred] The data type to load. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
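 *
 * \par
 * A minimal sketch of this guarded overload; the kernel name, tile shape, and
 * the chosen default value are illustrative assumptions:
 * \code
 * #include <cub/block/block_load.cuh>
 *
 * __global__ void ExampleGuardedWarpStripedLoadKernel(int *d_in, int num_items)
 * {
 *     int items[4];
 *     // Positions at or beyond num_items receive 0 instead of being read
 *     cub::LoadDirectWarpStriped(threadIdx.x, d_in, items, num_items, 0);
 * }
 * \endcode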
*/ template < typename InputT, typename DefaultT, int ITEMS_PER_THREAD, typename InputIteratorT> __device__ __forceinline__ void LoadDirectWarpStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { // Load directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) items[ITEM] = oob_default; LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); } //@} end member group /** @} */ // end group UtilIo //----------------------------------------------------------------------------- // Generic BlockLoad abstraction //----------------------------------------------------------------------------- /** * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. */ enum BlockLoadAlgorithm { /** * \par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is read * directly from memory. * * \par Performance Considerations * The utilization of memory transactions (coalescing) decreases as the * access stride between threads increases (i.e., the number items per thread). */ BLOCK_LOAD_DIRECT, /** * \par Overview * * A [striped arrangement](index.html#sec5sec3) of data is read * directly from memory. * * \par Performance Considerations * The utilization of memory transactions (coalescing) doesn't depend on * the number of items per thread. */ BLOCK_LOAD_STRIPED, /** * \par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is read * from memory using CUDA's built-in vectorized loads as a coalescing optimization. * For example, ld.global.v4.s32 instructions will be generated * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. * * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high until the the * access stride between threads (i.e., the number items per thread) exceeds the * maximum vector load width (typically 4 items or 64B, whichever is lower). * - The following conditions will prevent vectorization and loading will fall * back to cub::BLOCK_LOAD_DIRECT: * - \p ITEMS_PER_THREAD is odd * - The \p InputIteratorT is not a simple pointer type * - The block input offset is not quadword-aligned * - The data type \p T is not a built-in primitive or CUDA vector type * (e.g., \p short, \p int2, \p double, \p float2, etc.) */ BLOCK_LOAD_VECTORIZE, /** * \par Overview * * A [striped arrangement](index.html#sec5sec3) of data is read * efficiently from memory and then locally transposed into a * [blocked arrangement](index.html#sec5sec3). * * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items loaded per thread. * - The local reordering incurs slightly longer latencies and throughput than the * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. */ BLOCK_LOAD_TRANSPOSE, /** * \par Overview * * A [warp-striped arrangement](index.html#sec5sec3) of data is * read efficiently from memory and then locally transposed into a * [blocked arrangement](index.html#sec5sec3). 
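 *
 * \par
 * A minimal sketch of requesting this strategy (the kernel name and sizes are
 * illustrative assumptions):
 * \code
 * #include <cub/block/block_load.cuh>
 *
 * __global__ void ExampleWarpTransposeLoadKernel(int *d_in)
 * {
 *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
 *     __shared__ typename BlockLoad::TempStorage temp_storage;
 *
 *     int items[4];
 *     BlockLoad(temp_storage).Load(d_in, items);
 * }
 * \endcode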
* * \par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items loaded per thread. * - The local reordering incurs slightly larger latencies than the * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. * - Provisions more shared storage, but incurs smaller latencies than the * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. */ BLOCK_LOAD_WARP_TRANSPOSE, /** * \par Overview * * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) * of data is read directly from memory and then is locally transposed into a * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory * requirement, only one warp's worth of shared memory is provisioned and is * subsequently time-sliced among warps. * * \par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items loaded per thread. * - Provisions less shared memory temporary storage, but incurs larger * latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. */ BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, }; /** * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) * \ingroup BlockModule * \ingroup UtilIo * * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - The BlockLoad class provides a single data movement abstraction that can be specialized * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different * performance policies for different architectures, data types, granularity sizes, etc. * - BlockLoad can be optionally specialized by different data movement strategies: * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) * -# cub::BLOCK_LOAD_STRIPED,. A [striped arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory using CUDA's built-in vectorized loads as a * coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm) * -# cub::BLOCK_LOAD_TRANSPOSE. 
A [striped arrangement](index.html#sec5sec3) * of data is read directly from memory and is then locally transposed into a * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) * -# cub::BLOCK_LOAD_WARP_TRANSPOSE. A [warp-striped arrangement](index.html#sec5sec3) * of data is read directly from memory and is then locally transposed into a * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) * -# cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,. A [warp-striped arrangement](index.html#sec5sec3) * of data is read directly from memory and is then locally transposed into a * [blocked arrangement](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm) * - \rowmajor * * \par A Simple Example * \blockcollective{BlockLoad} * \par * The code snippet below illustrates the loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockLoad BlockLoad; * * // Allocate shared memory for BlockLoad * __shared__ typename BlockLoad::TempStorage temp_storage; * * // Load a segment of consecutive items that are blocked across threads * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data); * * \endcode * \par * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... * The set of \p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockLoad. 
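 *
 * \par
 * A minimal sketch of the guarded loading interface for partially-full tiles;
 * the kernel name, sizes, and the default value are illustrative assumptions:
 * \code
 * #include <cub/block/block_load.cuh>
 *
 * __global__ void ExamplePartialTileKernel(int *d_in, int num_items)
 * {
 *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
 *     __shared__ typename BlockLoad::TempStorage temp_storage;
 *
 *     int items[4];
 *     // Only the first num_items positions are read; the rest are filled with -1
 *     BlockLoad(temp_storage).Load(d_in, items, num_items, -1);
 * }
 * \endcode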
*/ template < typename InputT, int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockLoad { private: /****************************************************************************** * Constants and typed definitions ******************************************************************************/ /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /****************************************************************************** * Algorithmic variants ******************************************************************************/ /// Load helper template struct LoadInternal; /** * BLOCK_LOAD_DIRECT specialization of load helper */ template struct LoadInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ LoadInternal( TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /// Load a linear segment of items from memory template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { LoadDirectBlocked(linear_tid, block_itr, items); } /// Load a linear segment of items from memory, guarded by range template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; /** * BLOCK_LOAD_STRIPED specialization of load helper */ template struct LoadInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ LoadInternal( TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /// Load a linear segment of items from memory template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { LoadDirectStriped(linear_tid, block_itr, items); } /// Load a linear segment of items from memory, guarded by range template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { LoadDirectStriped(linear_tid, block_itr, items, valid_items); } /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for 
loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); } }; /** * BLOCK_LOAD_VECTORIZE specialization of load helper */ template struct LoadInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ LoadInternal( TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) template __device__ __forceinline__ void Load( InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) template __device__ __forceinline__ void Load( const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) template < CacheLoadModifier MODIFIER, typename ValueType, typename OffsetT> __device__ __forceinline__ void Load( CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); } /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) template __device__ __forceinline__ void Load( _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { LoadDirectBlocked(linear_tid, block_itr, items); } /// Load a linear segment of items from memory, guarded by range (skips vectorization) template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; /** * BLOCK_LOAD_TRANSPOSE specialization of load helper */ template struct LoadInternal { // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage {}; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage 
_TempStorage &temp_storage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ LoadInternal( TempStorage &temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()), linear_tid(linear_tid) {} /// Load a linear segment of items from memory template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ { LoadDirectStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).StripedToBlocked(items, items); } /// Load a linear segment of items from memory, guarded by range template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { LoadDirectStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).StripedToBlocked(items, items); } /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).StripedToBlocked(items, items); } }; /** * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper */ template struct LoadInternal { enum { WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage {}; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage &temp_storage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ LoadInternal( TempStorage &temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()), linear_tid(linear_tid) {} /// Load a linear segment of items from memory template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ { LoadDirectWarpStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } /// Load a linear segment of items from memory, guarded by range template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading 
from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } }; /** * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper */ template struct LoadInternal { enum { WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage {}; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage &temp_storage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ LoadInternal( TempStorage &temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()), linear_tid(linear_tid) {} /// Load a linear segment of items from memory template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ { LoadDirectWarpStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } /// Load a linear segment of items from memory, guarded by range template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } }; /****************************************************************************** * Type definitions ******************************************************************************/ /// Internal load implementation to use typedef LoadInternal InternalLoad; /// Shared memory storage layout type typedef typename InternalLoad::TempStorage _TempStorage; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /****************************************************************************** * Thread fields ******************************************************************************/ /// Thread reference to shared storage _TempStorage 
&temp_storage; /// Linear thread-id int linear_tid; public: /// \smemstorage{BlockLoad} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockLoad() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockLoad( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Data movement *********************************************************************/ //@{ /** * \brief Load a linear segment of items from memory. * * \par * - \blocked * - \smemreuse * * \par Snippet * The code snippet below illustrates the loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockLoad BlockLoad; * * // Allocate shared memory for BlockLoad * __shared__ typename BlockLoad::TempStorage temp_storage; * * // Load a segment of consecutive items that are blocked across threads * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data); * * \endcode * \par * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... * The set of \p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * */ template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { InternalLoad(temp_storage, linear_tid).Load(block_itr, items); } /** * \brief Load a linear segment of items from memory, guarded by range. * * \par * - \blocked * - \smemreuse * * \par Snippet * The code snippet below illustrates the guarded loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) 
* { * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockLoad BlockLoad; * * // Allocate shared memory for BlockLoad * __shared__ typename BlockLoad::TempStorage temp_storage; * * // Load a segment of consecutive items that are blocked across threads * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); * * \endcode * \par * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. * The set of \p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads * being unmasked to load portions of valid data (and other items remaining unassigned). * */ template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); } /** * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements * * \par * - \blocked * - \smemreuse * * \par Snippet * The code snippet below illustrates the guarded loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) * { * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockLoad BlockLoad; * * // Allocate shared memory for BlockLoad * __shared__ typename BlockLoad::TempStorage temp_storage; * * // Load a segment of consecutive items that are blocked across threads * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); * * \endcode * \par * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., * \p valid_items is \p 5, and the out-of-bounds default is \p -1. * The set of \p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads * being unmasked to load portions of valid data (and other items are assigned \p -1) * */ template __device__ __forceinline__ void Load( InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); } //@} end member group }; template > struct BlockLoadType { using type = cub::BlockLoad; }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_merge_sort.cuh000066400000000000000000000701071434614775400201730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include #include #include CUB_NAMESPACE_BEGIN // Additional details of the Merge-Path Algorithm can be found in: // S. Odeh, O. Green, Z. Mwassi, O. Shmueli, Y. Birk, " Merge Path - Parallel // Merging Made Simple", Multithreaded Architectures and Applications (MTAAP) // Workshop, IEEE 26th International Parallel & Distributed Processing // Symposium (IPDPS), 2012 template __device__ __forceinline__ OffsetT MergePath(KeyIteratorT keys1, KeyIteratorT keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) { OffsetT keys1_begin = diag < keys2_count ? 0 : diag - keys2_count; OffsetT keys1_end = (cub::min)(diag, keys1_count); while (keys1_begin < keys1_end) { OffsetT mid = cub::MidPoint(keys1_begin, keys1_end); KeyT key1 = keys1[mid]; KeyT key2 = keys2[diag - 1 - mid]; bool pred = binary_pred(key2, key1); if (pred) { keys1_end = mid; } else { keys1_begin = mid + 1; } } return keys1_begin; } template __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, int keys1_beg, int keys2_beg, int keys1_count, int keys2_count, KeyT (&output)[ITEMS_PER_THREAD], int (&indices)[ITEMS_PER_THREAD], CompareOp compare_op) { int keys1_end = keys1_beg + keys1_count; int keys2_end = keys2_beg + keys2_count; KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); output[item] = p ? key2 : key1; indices[item] = p ? keys2_beg++ : keys1_beg++; if (p) { key2 = keys_shared[keys2_beg]; } else { key1 = keys_shared[keys1_beg]; } } } /** * @brief Generalized merge sort algorithm * * This class is used to reduce code duplication. Warp and Block merge sort * differ only in how they compute thread index and how they synchronize * threads. Since synchronization might require access to custom data * (like member mask), CRTP is used. 
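 *
 * @par
 * For intuition about the MergePath helper defined above (illustrative values, not
 * taken from the library's tests): merging `keys1 = {1, 3, 5}` with `keys2 = {2, 4, 6}`
 * under `operator<` and asking for diagonal `diag = 3` returns `keys1_begin = 2`,
 * meaning the first three merged items are drawn from two items of `keys1` (1, 3) and
 * one item of `keys2` (2). SerialMerge then merges each such partition sequentially
 * out of shared memory.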
* * @par * The code snippet below illustrates the way this class can be used. * @par * @code * #include // or equivalently * * constexpr int BLOCK_THREADS = 256; * constexpr int ITEMS_PER_THREAD = 9; * * class BlockMergeSort : public BlockMergeSortStrategy * { * using BlockMergeSortStrategyT = * BlockMergeSortStrategy; * public: * __device__ __forceinline__ explicit BlockMergeSort( * typename BlockMergeSortStrategyT::TempStorage &temp_storage) * : BlockMergeSortStrategyT(temp_storage, threadIdx.x) * {} * * __device__ __forceinline__ void SyncImplementation() const * { * __syncthreads(); * } * }; * @endcode * * @tparam KeyT * KeyT type * * @tparam ValueT * ValueT type. cub::NullType indicates a keys-only sort * * @tparam SynchronizationPolicy * Provides a way of synchronizing threads. Should be derived from * `BlockMergeSortStrategy`. */ template class BlockMergeSortStrategy { static_assert(PowerOfTwo::VALUE, "NUM_THREADS must be a power of two"); private: static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * NUM_THREADS; // Whether or not there are values to be trucked along with keys static constexpr bool KEYS_ONLY = std::is_same::value; /// Shared memory type required by this thread block union _TempStorage { KeyT keys_shared[ITEMS_PER_TILE + 1]; ValueT items_shared[ITEMS_PER_TILE + 1]; }; // union TempStorage /// Shared storage reference _TempStorage &temp_storage; /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } const unsigned int linear_tid; public: /// \smemstorage{BlockMergeSort} struct TempStorage : Uninitialized<_TempStorage> {}; BlockMergeSortStrategy() = delete; explicit __device__ __forceinline__ BlockMergeSortStrategy(unsigned int linear_tid) : temp_storage(PrivateStorage()) , linear_tid(linear_tid) {} __device__ __forceinline__ BlockMergeSortStrategy(TempStorage &temp_storage, unsigned int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} __device__ __forceinline__ unsigned int get_linear_tid() const { return linear_tid; } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * Sort is not guaranteed to be stable. That is, suppose that i and j are * equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template __device__ __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD], CompareOp compare_op) { ValueT items[ITEMS_PER_THREAD]; Sort(keys, items, compare_op, ITEMS_PER_TILE, keys[0]); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * - Sort is not guaranteed to be stable. That is, suppose that `i` and `j` * are equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * - The value of `oob_default` is assigned to all elements that are out of * `valid_items` boundaries. It's expected that `oob_default` is ordered * after any value in the `valid_items` boundaries. 
The algorithm always * sorts a fixed amount of elements, which is equal to * `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered * after `oob_default`, it won't be placed within `valid_items` boundaries. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] valid_items * Number of valid items to sort * * @param[in] oob_default * Default value to assign out-of-bound items * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template __device__ __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) { ValueT items[ITEMS_PER_THREAD]; Sort(keys, items, compare_op, valid_items, oob_default); } /** * @brief Sorts items partitioned across a CUDA thread block using a merge sorting method. * * @par * Sort is not guaranteed to be stable. That is, suppose that `i` and `j` are * equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template __device__ __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op) { Sort(keys, items, compare_op, ITEMS_PER_TILE, keys[0]); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * - Sort is not guaranteed to be stable. That is, suppose that `i` and `j` * are equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * - The value of `oob_default` is assigned to all elements that are out of * `valid_items` boundaries. It's expected that `oob_default` is ordered * after any value in the `valid_items` boundaries. The algorithm always * sorts a fixed amount of elements, which is equal to * `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered * after `oob_default`, it won't be placed within `valid_items` boundaries. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)` * `CompareOp` is a model of [Strict Weak Ordering]. 
* * @tparam IS_LAST_TILE * True if `valid_items` isn't equal to the `ITEMS_PER_TILE` * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] valid_items * Number of valid items to sort * * @param[in] oob_default * Default value to assign out-of-bound items * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template __device__ __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) { if (IS_LAST_TILE) { // if last tile, find valid max_key // and fill the remaining keys with it // KeyT max_key = oob_default; #pragma unroll for (int item = 1; item < ITEMS_PER_THREAD; ++item) { if (ITEMS_PER_THREAD * linear_tid + item < valid_items) { max_key = compare_op(max_key, keys[item]) ? keys[item] : max_key; } else { keys[item] = max_key; } } } // if first element of thread is in input range, stable sort items // if (!IS_LAST_TILE || ITEMS_PER_THREAD * linear_tid < valid_items) { StableOddEvenSort(keys, items, compare_op); } // each thread has sorted keys // merge sort keys in shared memory // #pragma unroll for (int target_merged_threads_number = 2; target_merged_threads_number <= NUM_THREADS; target_merged_threads_number *= 2) { int merged_threads_number = target_merged_threads_number / 2; int mask = target_merged_threads_number - 1; Sync(); // store keys in shmem // #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = ITEMS_PER_THREAD * linear_tid + item; temp_storage.keys_shared[idx] = keys[item]; } Sync(); int indices[ITEMS_PER_THREAD]; int first_thread_idx_in_thread_group_being_merged = ~mask & linear_tid; int start = ITEMS_PER_THREAD * first_thread_idx_in_thread_group_being_merged; int size = ITEMS_PER_THREAD * merged_threads_number; int thread_idx_in_thread_group_being_merged = mask & linear_tid; int diag = (cub::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); int keys1_beg = (cub::min)(valid_items, start); int keys1_end = (cub::min)(valid_items, keys1_beg + size); int keys2_beg = keys1_end; int keys2_end = (cub::min)(valid_items, keys2_beg + size); int keys1_count = keys1_end - keys1_beg; int keys2_count = keys2_end - keys2_beg; int partition_diag = MergePath(&temp_storage.keys_shared[keys1_beg], &temp_storage.keys_shared[keys2_beg], keys1_count, keys2_count, diag, compare_op); int keys1_beg_loc = keys1_beg + partition_diag; int keys1_end_loc = keys1_end; int keys2_beg_loc = keys2_beg + diag - partition_diag; int keys2_end_loc = keys2_end; int keys1_count_loc = keys1_end_loc - keys1_beg_loc; int keys2_count_loc = keys2_end_loc - keys2_beg_loc; SerialMerge(&temp_storage.keys_shared[0], keys1_beg_loc, keys2_beg_loc, keys1_count_loc, keys2_count_loc, keys, indices, compare_op); if (!KEYS_ONLY) { Sync(); // store keys in shmem // #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = ITEMS_PER_THREAD * linear_tid + item; temp_storage.items_shared[idx] = items[item]; } Sync(); // gather items from shmem // #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { items[item] = temp_storage.items_shared[indices[item]]; } } } } // func block_merge_sort /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. 
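 *
 * @par
 * A minimal keys-only sketch (illustrative names; assumes a 1D block of 128 threads
 * owning 4 integer keys each, accessed through the derived BlockMergeSort class
 * defined further below):
 * @code
 * using BlockMergeSortT = cub::BlockMergeSort<int, 128, 4>;
 * __shared__ typename BlockMergeSortT::TempStorage temp_storage;
 *
 * int thread_keys[4];
 * // ... obtain this thread's 4 keys ...
 * BlockMergeSortT(temp_storage).StableSort(thread_keys, CustomLess());
 * @endcode
 * Here `CustomLess` stands for any user-provided functor with
 * `bool operator()(int lhs, int rhs)` implementing less-than, as in the
 * BlockMergeSort example further below.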
* * @par * StableSort is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes `y`, * and if the two elements are equivalent (neither `x < y` nor `y < x`) then * a postcondition of StableSort is that `x` still precedes `y`. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template __device__ __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD], CompareOp compare_op) { Sort(keys, compare_op); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * StableSort is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes `y`, * and if the two elements are equivalent (neither `x < y` nor `y < x`) then * a postcondition of StableSort is that `x` still precedes `y`. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template __device__ __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op) { Sort(keys, items, compare_op); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * - StableSort is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes * `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) * then a postcondition of StableSort is that `x` still precedes `y`. * - The value of `oob_default` is assigned to all elements that are out of * `valid_items` boundaries. It's expected that `oob_default` is ordered * after any value in the `valid_items` boundaries. The algorithm always * sorts a fixed amount of elements, which is equal to * `ITEMS_PER_THREAD * BLOCK_THREADS`. * If there is a value that is ordered after `oob_default`, it won't be * placed within `valid_items` boundaries. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] valid_items * Number of valid items to sort * * @param[in] oob_default * Default value to assign out-of-bound items * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template __device__ __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) { Sort(keys, compare_op, valid_items, oob_default); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. 
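 *
 * @par
 * A rough sketch of a guarded (partial-tile) key-value sort (illustrative names;
 * assumes a key-value specialization such as `cub::BlockMergeSort<int, 128, 4, float>`,
 * a `CustomLess` functor as above, and `INT_MAX` chosen as an out-of-bounds default
 * that orders after every valid key):
 * @code
 * int   thread_keys[4];
 * float thread_values[4];
 * // ... obtain this thread's keys/values and the tile's valid_items count ...
 * BlockMergeSortT(temp_storage).StableSort(thread_keys,
 *                                          thread_values,
 *                                          CustomLess(),
 *                                          valid_items,
 *                                          INT_MAX);
 * @endcode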
* * @par * - StableSort is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes * `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) * then a postcondition of StableSort is that `x` still precedes `y`. * - The value of `oob_default` is assigned to all elements that are out of * `valid_items` boundaries. It's expected that `oob_default` is ordered * after any value in the `valid_items` boundaries. The algorithm always * sorts a fixed amount of elements, which is equal to * `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered * after `oob_default`, it won't be placed within `valid_items` boundaries. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @tparam IS_LAST_TILE * True if `valid_items` isn't equal to the `ITEMS_PER_TILE` * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] valid_items * Number of valid items to sort * * @param[in] oob_default * Default value to assign out-of-bound items * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template __device__ __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) { Sort(keys, items, compare_op, valid_items, oob_default); } private: __device__ __forceinline__ void Sync() const { static_cast(this)->SyncImplementation(); } }; /** * @brief The BlockMergeSort class provides methods for sorting items * partitioned across a CUDA thread block using a merge sorting method. * @ingroup BlockModule * * @tparam KeyT * KeyT type * * @tparam BLOCK_DIM_X * The thread block length in threads along the X dimension * * @tparam ITEMS_PER_THREAD * The number of items per thread * * @tparam ValueT * **[optional]** ValueT type (default: `cub::NullType`, which indicates * a keys-only sort) * * @tparam BLOCK_DIM_Y * **[optional]** The thread block length in threads along the Y dimension * (default: 1) * * @tparam BLOCK_DIM_Z * **[optional]** The thread block length in threads along the Z dimension * (default: 1) * * @par Overview * BlockMergeSort arranges items into ascending order using a comparison * functor with less-than semantics. Merge sort can handle arbitrary types * and comparison functors, but is slower than BlockRadixSort when sorting * arithmetic types into ascending/descending order. * * @par A Simple Example * @blockcollective{BlockMergeSort} * @par * The code snippet below illustrates a sort of 512 integer keys that are * partitioned across 128 threads * where each thread owns 4 consecutive items. * @par * @code * #include // or equivalently * * struct CustomLess * { * template * __device__ bool operator()(const DataType &lhs, const DataType &rhs) * { * return lhs < rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockMergeSort for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockMergeSort BlockMergeSort; * * // Allocate shared memory for BlockMergeSort * __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * ... 
* * BlockMergeSort(temp_storage_shuffle).Sort(thread_keys, CustomLess()); * ... * } * @endcode * @par * Suppose the set of input `thread_keys` across the block of threads is * `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. * The corresponding output `thread_keys` in those threads will be * `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`. * * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockMergeSort. */ template class BlockMergeSort : public BlockMergeSortStrategy> { private: // The thread block size in threads static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * BLOCK_THREADS; using BlockMergeSortStrategyT = BlockMergeSortStrategy; public: __device__ __forceinline__ BlockMergeSort() : BlockMergeSortStrategyT( RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} __device__ __forceinline__ explicit BlockMergeSort( typename BlockMergeSortStrategyT::TempStorage &temp_storage) : BlockMergeSortStrategyT( temp_storage, RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} private: __device__ __forceinline__ void SyncImplementation() const { CTA_SYNC(); } friend BlockMergeSortStrategyT; }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_radix_rank.cuh000066400000000000000000001230761434614775400201530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block */ #pragma once #include #include "../thread/thread_reduce.cuh" #include "../thread/thread_scan.cuh" #include "../block/block_scan.cuh" #include "../block/radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \brief Radix ranking algorithm, the algorithm used to implement stable ranking of the * keys from a single tile. Note that different ranking algorithms require different * initial arrangements of keys to function properly. */ enum RadixRankAlgorithm { /** Ranking using the BlockRadixRank algorithm with MEMOIZE_OUTER_SCAN == false. It * uses thread-private histograms, and thus uses more shared memory. Requires blocked * arrangement of keys. Does not support count callbacks. */ RADIX_RANK_BASIC, /** Ranking using the BlockRadixRank algorithm with MEMOIZE_OUTER_SCAN == * true. Similar to RADIX_RANK BASIC, it requires blocked arrangement of * keys and does not support count callbacks.*/ RADIX_RANK_MEMOIZE, /** Ranking using the BlockRadixRankMatch algorithm. It uses warp-private * histograms and matching for ranking the keys in a single warp. Therefore, * it uses less shared memory compared to RADIX_RANK_BASIC. It requires * warp-striped key arrangement and supports count callbacks. */ RADIX_RANK_MATCH, /** Ranking using the BlockRadixRankMatchEarlyCounts algorithm with * MATCH_ALGORITHM == WARP_MATCH_ANY. An alternative implementation of * match-based ranking that computes bin counts early. Because of this, it * works better with onesweep sorting, which requires bin counts for * decoupled look-back. Assumes warp-striped key arrangement and supports * count callbacks.*/ RADIX_RANK_MATCH_EARLY_COUNTS_ANY, /** Ranking using the BlockRadixRankEarlyCounts algorithm with * MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR. It uses extra space in shared * memory to generate warp match masks using atomicOr(). This is faster when * there are few matches, but can lead to slowdowns if the number of * matching keys among warp lanes is high. Assumes warp-striped key * arrangement and supports count callbacks. */ RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR }; /** Empty callback implementation */ template struct BlockRadixRankEmptyCallback { __device__ __forceinline__ void operator()(int (&bins)[BINS_PER_THREAD]) {} }; /** * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. * \ingroup BlockModule * * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam RADIX_BITS The number of radix bits per digit place * \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. 
* \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * Blah... * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). * - \blocked * * \par Performance Considerations * - \granularity * * \par Examples * \par * - Example 1: Simple radix rank of 32-bit integer keys * \code * #include * * template * __global__ void ExampleKernel(...) * { * * \endcode * * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockRadixRank. */ template < int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = true, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockRadixRank { private: /****************************************************************************** * Type definitions and constants ******************************************************************************/ // Integer type for digit counters (to be packed into words of type PackedCounters) typedef unsigned short DigitCounter; // Integer type for packing DigitCounters into columns of shared memory banks using PackedCounter = cub::detail::conditional_t; enum { // The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, RADIX_DIGITS = 1 << RADIX_BITS, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, BYTES_PER_COUNTER = sizeof(DigitCounter), LOG_BYTES_PER_COUNTER = Log2::VALUE, PACKING_RATIO = static_cast(sizeof(PackedCounter) / sizeof(DigitCounter)), LOG_PACKING_RATIO = Log2::VALUE, LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), // Always at least one lane COUNTER_LANES = 1 << LOG_COUNTER_LANES, // The number of packed counters per thread (plus one for padding) PADDED_COUNTER_LANES = COUNTER_LANES + 1, RAKING_SEGMENT = PADDED_COUNTER_LANES, }; public: enum { /// Number of bin-starting offsets tracked per thread BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), }; private: /// BlockScan type typedef BlockScan< PackedCounter, BLOCK_DIM_X, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScan; /// Shared memory storage layout type for BlockRadixRank struct __align__(16) _TempStorage { union Aliasable { DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; } aliasable; // Storage for scanning local ranks typename BlockScan::TempStorage block_scan; }; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage 
&temp_storage; /// Linear thread-id unsigned int linear_tid; /// Copy of raking segment, promoted to registers PackedCounter cached_segment[RAKING_SEGMENT]; /****************************************************************************** * Utility methods ******************************************************************************/ /** * Internal storage allocator */ __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /** * Performs upsweep raking reduction, returning the aggregate */ __device__ __forceinline__ PackedCounter Upsweep() { PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; PackedCounter *raking_ptr; if (MEMOIZE_OUTER_SCAN) { // Copy data into registers #pragma unroll for (int i = 0; i < RAKING_SEGMENT; i++) { cached_segment[i] = smem_raking_ptr[i]; } raking_ptr = cached_segment; } else { raking_ptr = smem_raking_ptr; } return internal::ThreadReduce(raking_ptr, Sum()); } /// Performs exclusive downsweep raking scan __device__ __forceinline__ void ExclusiveDownsweep( PackedCounter raking_partial) { PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? cached_segment : smem_raking_ptr; // Exclusive raking downsweep scan internal::ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); if (MEMOIZE_OUTER_SCAN) { // Copy data back to smem #pragma unroll for (int i = 0; i < RAKING_SEGMENT; i++) { smem_raking_ptr[i] = cached_segment[i]; } } } /** * Reset shared memory digit counters */ __device__ __forceinline__ void ResetCounters() { // Reset shared memory digit counters #pragma unroll for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++) { *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0; } } /** * Block-scan prefix callback */ struct PrefixCallBack { __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate) { PackedCounter block_prefix = 0; // Propagate totals in packed fields #pragma unroll for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) { block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED); } return block_prefix; } }; /** * Scan shared memory digit counters. */ __device__ __forceinline__ void ScanCounters() { // Upsweep scan PackedCounter raking_partial = Upsweep(); // Compute exclusive sum PackedCounter exclusive_partial; PrefixCallBack prefix_call_back; BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back); // Downsweep scan with exclusive partial ExclusiveDownsweep(exclusive_partial); } public: /// \smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockRadixRank() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. 
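 *
 * \par
 * A minimal construction sketch (illustrative names; assumes a 1D block of 128
 * threads ranking 5-bit digits in ascending order):
 * \code
 * using BlockRadixRankT = cub::BlockRadixRank<128, 5, false>;
 * __shared__ typename BlockRadixRankT::TempStorage temp_storage;
 * BlockRadixRankT rank(temp_storage);
 * \endcode
 * Keys (in unsigned-bits form) are then ranked by calling RankKeys() with a
 * per-thread keys array, an output ranks array, and a digit extractor.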
*/ __device__ __forceinline__ BlockRadixRank( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Raking *********************************************************************/ //@{ /** * \brief Rank keys. */ template < typename UnsignedBits, int KEYS_PER_THREAD, typename DigitExtractorT> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile DigitExtractorT digit_extractor) ///< [in] The digit extractor { DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem // Reset shared memory digit counters ResetCounters(); #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // Get digit unsigned int digit = digit_extractor.Digit(keys[ITEM]); // Get sub-counter unsigned int sub_counter = digit >> LOG_COUNTER_LANES; // Get counter lane unsigned int counter_lane = digit & (COUNTER_LANES - 1); if (IS_DESCENDING) { sub_counter = PACKING_RATIO - 1 - sub_counter; counter_lane = COUNTER_LANES - 1 - counter_lane; } // Pointer to smem digit counter digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter]; // Load thread-exclusive prefix thread_prefixes[ITEM] = *digit_counters[ITEM]; // Store inclusive prefix *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; } CTA_SYNC(); // Scan shared memory counters ScanCounters(); CTA_SYNC(); // Extract the local ranks of each key #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // Add in thread block exclusive prefix ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM]; } } /** * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. */ template < typename UnsignedBits, int KEYS_PER_THREAD, typename DigitExtractorT> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) DigitExtractorT digit_extractor, ///< [in] The digit extractor int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] { // Rank keys RankKeys(keys, ranks, digit_extractor); // Get the inclusive and exclusive digit totals corresponding to the calling thread. #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) bin_idx = RADIX_DIGITS - bin_idx - 1; // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the // first counter column, resulting in unavoidable bank conflicts.) 
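                // Decompose bin_idx into packed-counter coordinates: the low
                // LOG_COUNTER_LANES bits select the counter lane (row of
                // digit_counters), and the remaining high bits select the
                // sub-counter (packed column) within that lane. After
                // ScanCounters(), thread 0's counter for a digit holds that
                // digit's tile-wide exclusive prefix.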
unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1)); unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES); exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter]; } } } }; /** * Radix-rank using match.any */ template < int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockRadixRankMatch { private: /****************************************************************************** * Type definitions and constants ******************************************************************************/ typedef int32_t RankT; typedef int32_t DigitCounterT; enum { // The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, RADIX_DIGITS = 1 << RADIX_BITS, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, PADDED_WARPS = ((WARPS & 0x1) == 0) ? WARPS + 1 : WARPS, COUNTERS = PADDED_WARPS * RADIX_DIGITS, RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS, PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ? RAKING_SEGMENT + 1 : RAKING_SEGMENT, }; public: enum { /// Number of bin-starting offsets tracked per thread BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), }; private: /// BlockScan type typedef BlockScan< DigitCounterT, BLOCK_THREADS, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScanT; /// Shared memory storage layout type for BlockRadixRank struct __align__(16) _TempStorage { typename BlockScanT::TempStorage block_scan; union __align__(16) Aliasable { volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; } aliasable; }; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; public: /// \smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockRadixRankMatch( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Raking *********************************************************************/ //@{ /** \brief Computes the count of keys for each digit value, and calls the * callback with the array of key counts. * @tparam CountsCallback The callback type. It should implement an instance * overload of operator()(int (&bins)[BINS_TRACKED_PER_THREAD]), where bins * is an array of key counts for each digit value distributed in block * distribution among the threads of the thread block. Key counts can be * used, to update other data structures in global or shared * memory. 
Depending on the implementation of the ranking algoirhtm * (see BlockRadixRankMatchEarlyCounts), key counts may become available * early, therefore, they are returned through a callback rather than a * separate output parameter of RankKeys(). */ template __device__ __forceinline__ void CallBack(CountsCallback callback) { int bins[BINS_TRACKED_PER_THREAD]; // Get count for each digit #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; const int TILE_ITEMS = KEYS_PER_THREAD * BLOCK_THREADS; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) { bin_idx = RADIX_DIGITS - bin_idx - 1; bins[track] = (bin_idx > 0 ? temp_storage.aliasable.warp_digit_counters[bin_idx - 1][0] : TILE_ITEMS) - temp_storage.aliasable.warp_digit_counters[bin_idx][0]; } else { bins[track] = (bin_idx < RADIX_DIGITS - 1 ? temp_storage.aliasable.warp_digit_counters[bin_idx + 1][0] : TILE_ITEMS) - temp_storage.aliasable.warp_digit_counters[bin_idx][0]; } } } callback(bins); } /** * \brief Rank keys. */ template < typename UnsignedBits, int KEYS_PER_THREAD, typename DigitExtractorT, typename CountsCallback> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile DigitExtractorT digit_extractor, ///< [in] The digit extractor CountsCallback callback) { // Initialize shared digit counters #pragma unroll for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; CTA_SYNC(); // Each warp will strip-mine its section of input, one strip at a time volatile DigitCounterT *digit_counters[KEYS_PER_THREAD]; uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; uint32_t lane_mask_lt = LaneMaskLt(); #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // My digit uint32_t digit = digit_extractor.Digit(keys[ITEM]); if (IS_DESCENDING) digit = RADIX_DIGITS - digit - 1; // Mask of peers who have same digit as me uint32_t peer_mask = MatchAny(digit); // Pointer to smem digit counter for this key digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id]; // Number of occurrences in previous strips DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; // Warp-sync WARP_SYNC(0xFFFFFFFF); // Number of peers having same digit as me int32_t digit_count = __popc(peer_mask); // Number of lower-ranked peers having same digit seen so far int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt); if (peer_digit_prefix == 0) { // First thread for each digit updates the shared warp counter *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count); } // Warp-sync WARP_SYNC(0xFFFFFFFF); // Number of prior keys having same digit ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); } CTA_SYNC(); // Scan warp counters DigitCounterT scan_counters[PADDED_RAKING_SEGMENT]; #pragma unroll for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM]; BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters); #pragma unroll for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; CTA_SYNC(); if (!std::is_same< CountsCallback, BlockRadixRankEmptyCallback>::value) { CallBack(callback); } // Seed ranks with counter values 
from previous warps #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) ranks[ITEM] += *digit_counters[ITEM]; } template < typename UnsignedBits, int KEYS_PER_THREAD, typename DigitExtractorT> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor) { RankKeys(keys, ranks, digit_extractor, BlockRadixRankEmptyCallback()); } /** * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. */ template < typename UnsignedBits, int KEYS_PER_THREAD, typename DigitExtractorT, typename CountsCallback> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) DigitExtractorT digit_extractor, ///< [in] The digit extractor int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD], ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] CountsCallback callback) { RankKeys(keys, ranks, digit_extractor, callback); // Get exclusive count for each digit #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) bin_idx = RADIX_DIGITS - bin_idx - 1; exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0]; } } } template < typename UnsignedBits, int KEYS_PER_THREAD, typename DigitExtractorT> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) DigitExtractorT digit_extractor, int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] { RankKeys(keys, ranks, digit_extractor, exclusive_digit_prefix, BlockRadixRankEmptyCallback()); } }; enum WarpMatchAlgorithm { WARP_MATCH_ANY, WARP_MATCH_ATOMIC_OR }; /** * Radix-rank using matching which computes the counts of keys for each digit * value early, at the expense of doing more work. This may be useful e.g. for * decoupled look-back, where it reduces the time other thread blocks need to * wait for digit counts to become available. */ template struct BlockRadixRankMatchEarlyCounts { // constants enum { BLOCK_THREADS = BLOCK_DIM_X, RADIX_DIGITS = 1 << RADIX_BITS, BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS, BINS_TRACKED_PER_THREAD = BINS_PER_THREAD, FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS, WARP_THREADS = CUB_PTX_WARP_THREADS, BLOCK_WARPS = BLOCK_THREADS / WARP_THREADS, WARP_MASK = ~0, NUM_MATCH_MASKS = MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR ? BLOCK_WARPS : 0, // Guard against declaring zero-sized array: MATCH_MASKS_ALLOC_SIZE = NUM_MATCH_MASKS < 1 ? 
1 : NUM_MATCH_MASKS, }; // types typedef cub::BlockScan BlockScan; // temporary storage struct TempStorage { union { int warp_offsets[BLOCK_WARPS][RADIX_DIGITS]; int warp_histograms[BLOCK_WARPS][RADIX_DIGITS][NUM_PARTS]; }; int match_masks[MATCH_MASKS_ALLOC_SIZE][RADIX_DIGITS]; typename BlockScan::TempStorage prefix_tmp; }; TempStorage& temp_storage; // internal ranking implementation template struct BlockRadixRankMatchInternal { TempStorage& s; DigitExtractorT digit_extractor; CountsCallback callback; int warp; int lane; __device__ __forceinline__ int Digit(UnsignedBits key) { int digit = digit_extractor.Digit(key); return IS_DESCENDING ? RADIX_DIGITS - 1 - digit : digit; } __device__ __forceinline__ int ThreadBin(int u) { int bin = threadIdx.x * BINS_PER_THREAD + u; return IS_DESCENDING ? RADIX_DIGITS - 1 - bin : bin; } __device__ __forceinline__ void ComputeHistogramsWarp(UnsignedBits (&keys)[KEYS_PER_THREAD]) { //int* warp_offsets = &s.warp_offsets[warp][0]; int (&warp_histograms)[RADIX_DIGITS][NUM_PARTS] = s.warp_histograms[warp]; // compute warp-private histograms #pragma unroll for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) { #pragma unroll for (int part = 0; part < NUM_PARTS; ++part) { warp_histograms[bin][part] = 0; } } if (MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR) { int* match_masks = &s.match_masks[warp][0]; #pragma unroll for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) { match_masks[bin] = 0; } } WARP_SYNC(WARP_MASK); // compute private per-part histograms int part = lane % NUM_PARTS; #pragma unroll for (int u = 0; u < KEYS_PER_THREAD; ++u) { atomicAdd(&warp_histograms[Digit(keys[u])][part], 1); } // sum different parts; // no extra work is necessary if NUM_PARTS == 1 if (NUM_PARTS > 1) { WARP_SYNC(WARP_MASK); // TODO: handle RADIX_DIGITS % WARP_THREADS != 0 if it becomes necessary const int WARP_BINS_PER_THREAD = RADIX_DIGITS / WARP_THREADS; int bins[WARP_BINS_PER_THREAD]; #pragma unroll for (int u = 0; u < WARP_BINS_PER_THREAD; ++u) { int bin = lane + u * WARP_THREADS; bins[u] = internal::ThreadReduce(warp_histograms[bin], Sum()); } CTA_SYNC(); // store the resulting histogram in shared memory int* warp_offsets = &s.warp_offsets[warp][0]; #pragma unroll for (int u = 0; u < WARP_BINS_PER_THREAD; ++u) { int bin = lane + u * WARP_THREADS; warp_offsets[bin] = bins[u]; } } } __device__ __forceinline__ void ComputeOffsetsWarpUpsweep(int (&bins)[BINS_PER_THREAD]) { // sum up warp-private histograms #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { bins[u] = 0; int bin = ThreadBin(u); if (FULL_BINS || (bin >= 0 && bin < RADIX_DIGITS)) { #pragma unroll for (int j_warp = 0; j_warp < BLOCK_WARPS; ++j_warp) { int warp_offset = s.warp_offsets[j_warp][bin]; s.warp_offsets[j_warp][bin] = bins[u]; bins[u] += warp_offset; } } } } __device__ __forceinline__ void ComputeOffsetsWarpDownsweep(int (&offsets)[BINS_PER_THREAD]) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || (bin >= 0 && bin < RADIX_DIGITS)) { int digit_offset = offsets[u]; #pragma unroll for (int j_warp = 0; j_warp < BLOCK_WARPS; ++j_warp) { s.warp_offsets[j_warp][bin] += digit_offset; } } } } __device__ __forceinline__ void ComputeRanksItem( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], Int2Type) { // compute key ranks int lane_mask = 1 << lane; int* warp_offsets = &s.warp_offsets[warp][0]; int* match_masks = &s.match_masks[warp][0]; #pragma unroll for (int u = 0; u < KEYS_PER_THREAD; ++u) { int bin = Digit(keys[u]); 
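// Build this warp's match mask for the digit via shared-memory atomicOr:
// each lane sets its own bit in match_masks[bin], so after the warp sync
// the mask identifies every lane of the warp holding this digit. The
// highest such lane acts as leader and reserves space in warp_offsets.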
int* p_match_mask = &match_masks[bin]; atomicOr(p_match_mask, lane_mask); WARP_SYNC(WARP_MASK); int bin_mask = *p_match_mask; int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; int popc = __popc(bin_mask & LaneMaskLe()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } warp_offset = SHFL_IDX_SYNC(warp_offset, leader, bin_mask); if (lane == leader) *p_match_mask = 0; WARP_SYNC(WARP_MASK); ranks[u] = warp_offset + popc - 1; } } __device__ __forceinline__ void ComputeRanksItem( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], Int2Type) { // compute key ranks int* warp_offsets = &s.warp_offsets[warp][0]; #pragma unroll for (int u = 0; u < KEYS_PER_THREAD; ++u) { int bin = Digit(keys[u]); int bin_mask = MatchAny(bin); int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; int popc = __popc(bin_mask & LaneMaskLe()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } warp_offset = SHFL_IDX_SYNC(warp_offset, leader, bin_mask); ranks[u] = warp_offset + popc - 1; } } __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], int (&exclusive_digit_prefix)[BINS_PER_THREAD]) { ComputeHistogramsWarp(keys); CTA_SYNC(); int bins[BINS_PER_THREAD]; ComputeOffsetsWarpUpsweep(bins); callback(bins); BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix); ComputeOffsetsWarpDownsweep(exclusive_digit_prefix); CTA_SYNC(); ComputeRanksItem(keys, ranks, Int2Type()); } __device__ __forceinline__ BlockRadixRankMatchInternal (TempStorage& temp_storage, DigitExtractorT digit_extractor, CountsCallback callback) : s(temp_storage), digit_extractor(digit_extractor), callback(callback), warp(threadIdx.x / WARP_THREADS), lane(LaneId()) {} }; __device__ __forceinline__ BlockRadixRankMatchEarlyCounts (TempStorage& temp_storage) : temp_storage(temp_storage) {} /** * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. */ template __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int (&exclusive_digit_prefix)[BINS_PER_THREAD], CountsCallback callback) { BlockRadixRankMatchInternal internal(temp_storage, digit_extractor, callback); internal.RankKeys(keys, ranks, exclusive_digit_prefix); } template __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int (&exclusive_digit_prefix)[BINS_PER_THREAD]) { typedef BlockRadixRankEmptyCallback CountsCallback; BlockRadixRankMatchInternal internal(temp_storage, digit_extractor, CountsCallback()); internal.RankKeys(keys, ranks, exclusive_digit_prefix); } template __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor) { int exclusive_digit_prefix[BINS_PER_THREAD]; RankKeys(keys, ranks, digit_extractor, exclusive_digit_prefix); } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_radix_sort.cuh000066400000000000000000001162211434614775400202010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. */ #pragma once #include "block_exchange.cuh" #include "block_radix_rank.cuh" #include "radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. ![](sorting_logo.png) * \ingroup BlockModule * * \tparam KeyT KeyT type * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ITEMS_PER_THREAD The number of items per thread * \tparam ValueT [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges * items into ascending order. It relies upon a positional representation for * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, * characters, etc.) specified from least-significant to most-significant. 
For a * given input sequence of keys and a set of rules specifying a total ordering * of the symbolic alphabet, the radix sorting method produces a lexicographic * ordering of those keys. * * \rowmajor * * \par Supported Types * BlockRadixSort can sort all of the built-in C++ numeric primitive types * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half * half-precision floating-point type. * * \par Floating-Point Special Cases * * - Positive and negative zeros are considered equivalent, and will be treated * as such in the output. * - No special handling is implemented for NaN values; these are sorted * according to their bit representations after any transformations. * * \par Bitwise Key Transformations * Although the direct radix sorting method can only be applied to unsigned * integral types, BlockRadixSort is able to sort signed and floating-point * types via simple bit-wise transformations that ensure lexicographic key * ordering. * * These transformations must be considered when restricting the * `[begin_bit, end_bit)` range, as the bitwise transformations will occur * before the bit-range truncation. * * Any transformations applied to the keys prior to sorting are reversed * while writing to the final output buffer. * * \par Type Specific Bitwise Transformations * To convert the input values into a radix-sortable bitwise representation, * the following transformations take place prior to sorting: * * - For unsigned integral values, the keys are used directly. * - For signed integral values, the sign bit is inverted. * - For positive floating point values, the sign bit is inverted. * - For negative floating point values, the full key is inverted. * * \par No Descending Sort Transformations * Unlike `DeviceRadixSort`, `BlockRadixSort` does not invert the input key bits * when performing a descending sort. Instead, it has special logic to reverse * the order of the keys while sorting. * * \par Stability * BlockRadixSort is stable. For floating-point types -0.0 and +0.0 * are considered equal and appear in the result in the same order as they * appear in the input. * * * \par Performance Considerations * - \granularity * * \par A Simple Example * \blockcollective{BlockRadixSort} * \par * The code snippet below illustrates a sort of 512 integer keys that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * ... * * // Collectively sort the keys * BlockRadixSort(temp_storage).Sort(thread_keys); * * ... * \endcode * \par * Suppose the set of input \p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The * corresponding output \p thread_keys in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
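 *
 * \par Restricting the Sorting Bit-Range
 * As a rough illustration of the `[begin_bit, end_bit)` parameters discussed
 * above (the kernel and variable names below are illustrative only), the
 * following sketch sorts unsigned keys on their low 16 bits. For unsigned
 * integral keys no bitwise transformation is applied, so the bit indices
 * refer directly to the key bits.
 * \par
 * \code
 * __global__ void ExampleLowBitsKernel(...)
 * {
 *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 unsigned keys each
 *     typedef cub::BlockRadixSort<unsigned int, 128, 4> BlockRadixSort;
 *
 *     // Allocate shared memory for BlockRadixSort
 *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive keys that are blocked across threads
 *     unsigned int thread_keys[4];
 *     ...
 *
 *     // Collectively sort the keys, considering only bits [0, 16)
 *     BlockRadixSort(temp_storage).Sort(thread_keys, 0, 16);
 * }
 * \endcode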
* * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockRadixSort. */ template < typename KeyT, int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = true, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockRadixSort { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { // The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, // Whether or not there are values to be trucked along with keys KEYS_ONLY = std::is_same::value, }; // KeyT traits and unsigned bits type typedef Traits KeyTraits; typedef typename KeyTraits::UnsignedBits UnsignedBits; /// Ascending BlockRadixRank utility type typedef BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, false, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z> AscendingBlockRadixRank; /// Descending BlockRadixRank utility type typedef BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, true, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z> DescendingBlockRadixRank; /// Digit extractor type typedef BFEDigitExtractor DigitExtractorT; /// BlockExchange utility type for keys typedef BlockExchange BlockExchangeKeys; /// BlockExchange utility type for values typedef BlockExchange BlockExchangeValues; /// Shared memory storage layout type union _TempStorage { typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; typename BlockExchangeKeys::TempStorage exchange_keys; typename BlockExchangeValues::TempStorage exchange_values; }; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Rank keys (specialized for ascending sort) __device__ __forceinline__ void RankKeys( UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], DigitExtractorT digit_extractor, Int2Type /*is_descending*/) { AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( unsigned_keys, ranks, digit_extractor); } /// Rank keys (specialized for descending sort) __device__ __forceinline__ void RankKeys( UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], DigitExtractorT digit_extractor, Int2Type /*is_descending*/) { DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( unsigned_keys, ranks, digit_extractor); } /// ExchangeValues (specialized for 
key-value sort, to-blocked arrangement) __device__ __forceinline__ void ExchangeValues( ValueT (&values)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { CTA_SYNC(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); } /// ExchangeValues (specialized for key-value sort, to-striped arrangement) __device__ __forceinline__ void ExchangeValues( ValueT (&values)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { CTA_SYNC(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); } /// ExchangeValues (specialized for keys-only sort) template __device__ __forceinline__ void ExchangeValues( ValueT (&/*values*/)[ITEMS_PER_THREAD], int (&/*ranks*/)[ITEMS_PER_THREAD], Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) {} /// Sort blocked arrangement template __device__ __forceinline__ void SortBlocked( KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison Int2Type is_descending, ///< Tag whether is a descending-order sort Int2Type is_keys_only) ///< Tag whether is keys-only sort { UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast(keys); // Twiddle bits if necessary #pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); } // Radix sorting passes while (true) { int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); DigitExtractorT digit_extractor(begin_bit, pass_bits); // Rank the blocked keys int ranks[ITEMS_PER_THREAD]; RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; CTA_SYNC(); // Exchange keys through shared memory in blocked arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); // Exchange values through shared memory in blocked arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); // Quit if done if (begin_bit >= end_bit) break; CTA_SYNC(); } // Untwiddle bits if necessary #pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); } } public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Sort blocked -> striped arrangement template __device__ __forceinline__ void SortBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison Int2Type is_descending, ///< Tag whether is a descending-order sort Int2Type is_keys_only) ///< Tag whether is keys-only sort { UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast(keys); // Twiddle bits if necessary #pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); } // Radix sorting passes while (true) { int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); DigitExtractorT digit_extractor(begin_bit, pass_bits); // Rank the blocked keys int 
ranks[ITEMS_PER_THREAD]; RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; CTA_SYNC(); // Check if this is the last pass if (begin_bit >= end_bit) { // Last pass exchanges keys through shared memory in striped arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); // Last pass exchanges through shared memory in striped arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); // Quit break; } // Exchange keys through shared memory in blocked arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); // Exchange values through shared memory in blocked arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); CTA_SYNC(); } // Untwiddle bits if necessary #pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); } } #endif // DOXYGEN_SHOULD_SKIP_THIS /// \smemstorage{BlockRadixSort} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockRadixSort() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockRadixSort( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Sorting (blocked arrangements) *********************************************************************/ //@{ /** * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. * * \par * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sort of 512 integer keys that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive keys. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each * typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * ... * * // Collectively sort the keys * BlockRadixSort(temp_storage).Sort(thread_keys); * * \endcode * \par * Suppose the set of input \p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. * The corresponding output \p thread_keys in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
*/ __device__ __forceinline__ void Sort( KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison { NullType values[ITEMS_PER_THREAD]; SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } /** * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. * * \par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive pairs. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each * typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * int thread_values[4]; * ... * * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); * * \endcode * \par * Suppose the set of input \p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The * corresponding output \p thread_keys in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * */ __device__ __forceinline__ void Sort( KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } /** * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. * * \par * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sort of 512 integer keys that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive keys. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each * typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * ... 
* * // Collectively sort the keys * BlockRadixSort(temp_storage).Sort(thread_keys); * * \endcode * \par * Suppose the set of input \p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. * The corresponding output \p thread_keys in those threads will be * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. */ __device__ __forceinline__ void SortDescending( KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison { NullType values[ITEMS_PER_THREAD]; SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } /** * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. * * \par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive pairs. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each * typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * int thread_values[4]; * ... * * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); * * \endcode * \par * Suppose the set of input \p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The * corresponding output \p thread_keys in those threads will be * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. * */ __device__ __forceinline__ void SortDescending( KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //@} end member group /******************************************************************//** * \name Sorting (blocked arrangement -> striped arrangement) *********************************************************************/ //@{ /** * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). 
* * \par * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sort of 512 integer keys that * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive keys. The final partitioning is striped. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each * typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * ... * * // Collectively sort the keys * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); * * \endcode * \par * Suppose the set of input \p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The * corresponding output \p thread_keys in those threads will be * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. * */ __device__ __forceinline__ void SortBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison { NullType values[ITEMS_PER_THREAD]; SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } /** * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). * * \par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive pairs. The final partitioning is striped. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each * typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * int thread_values[4]; * ... * * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); * * \endcode * \par * Suppose the set of input \p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The * corresponding output \p thread_keys in those threads will be * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. 
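 *
 * \par
 * Because results are returned in a striped arrangement (for the snippet
 * above, item \p ITEM of thread \p t ends up holding tile element
 * ITEM * 128 + t), they can be written back to global memory with
 * coalesced accesses. A minimal sketch, assuming a hypothetical output
 * pointer \p d_out that points at the start of this block's tile:
 * \par
 * \code
 * #pragma unroll
 * for (int ITEM = 0; ITEM < 4; ++ITEM)
 * {
 *     d_out[(ITEM * 128) + threadIdx.x] = thread_keys[ITEM];
 * }
 * \endcode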
* */ __device__ __forceinline__ void SortBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } /** * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). * * \par * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sort of 512 integer keys that * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive keys. The final partitioning is striped. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each * typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * ... * * // Collectively sort the keys * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); * * \endcode * \par * Suppose the set of input \p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The * corresponding output \p thread_keys in those threads will be * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. * */ __device__ __forceinline__ void SortDescendingBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison { NullType values[ITEMS_PER_THREAD]; SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } /** * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). * * \par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive pairs. The final partitioning is striped. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each * typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * int thread_values[4]; * ... * * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); * * \endcode * \par * Suppose the set of input \p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The * corresponding output \p thread_keys in those threads will be * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. * */ __device__ __forceinline__ void SortDescendingBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //@} end member group }; /** * \example example_block_radix_sort.cu */ CUB_NAMESPACE_END cub-2.0.1/cub/block/block_raking_layout.cuh000066400000000000000000000135141434614775400206740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 
*/ #pragma once #include "../config.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) * \ingroup BlockModule * * \par Overview * This type facilitates a shared memory usage pattern where a block of CUDA * threads places elements into shared memory and then reduces the active * parallelism to one "raking" warp of threads for serially aggregating consecutive * sequences of shared items. Padding is inserted to eliminate bank conflicts * (for most data types). * * \tparam T The data type to be exchanged. * \tparam BLOCK_THREADS The thread block size in threads. * \tparam LEGACY_PTX_ARCH [optional] Unused. */ template < typename T, int BLOCK_THREADS, int LEGACY_PTX_ARCH = 0> struct BlockRakingLayout { //--------------------------------------------------------------------- // Constants and type definitions //--------------------------------------------------------------------- enum { /// The total number of elements that need to be cooperatively reduced SHARED_ELEMENTS = BLOCK_THREADS, /// Maximum number of warp-synchronous raking threads MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(0)), /// Number of raking elements per warp-synchronous raking thread (rounded up) SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) HAS_CONFLICTS = (CUB_SMEM_BANKS(0) % SEGMENT_LENGTH == 0), /// Degree of bank conflicts (e.g., 4-way) CONFLICT_DEGREE = (HAS_CONFLICTS) ? 
(MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(0) : 1, /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), /// Total number of elements in the raking grid GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), }; /** * \brief Shared memory storage type */ struct __align__(16) _TempStorage { T buff[BlockRakingLayout::GRID_ELEMENTS]; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /** * \brief Returns the location for the calling thread to place data into the grid */ static __device__ __forceinline__ T* PlacementPtr( TempStorage &temp_storage, unsigned int linear_tid) { // Offset for partial unsigned int offset = linear_tid; // Add in one padding element for every segment if (USE_SEGMENT_PADDING > 0) { offset += offset / SEGMENT_LENGTH; } // Incorporating a block of padding partials every shared memory segment return temp_storage.Alias().buff + offset; } /** * \brief Returns the location for the calling thread to begin sequential raking */ static __device__ __forceinline__ T* RakingPtr( TempStorage &temp_storage, unsigned int linear_tid) { return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_reduce.cuh000066400000000000000000000616461434614775400173040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. */ #pragma once #include "specializations/block_reduce_raking.cuh" #include "specializations/block_reduce_raking_commutative_only.cuh" #include "specializations/block_reduce_warp_reductions.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" #include "../thread/thread_operators.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Algorithmic variants ******************************************************************************/ /** * BlockReduceAlgorithm enumerates alternative algorithms for parallel * reduction across a CUDA thread block. */ enum BlockReduceAlgorithm { /** * \par Overview * An efficient "raking" reduction algorithm that only supports commutative * reduction operators (true for most operations, e.g., addition). * * \par * Execution is comprised of three phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Threads in warps other than the first warp place * their partial reductions into shared memory. * -# Upsweep sequential reduction in shared memory. Threads within the first * warp continue to accumulate by raking across segments of shared partial reductions * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * * \par * \image html block_reduce.png *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* * \par Performance Considerations * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE * and is preferable when the reduction operator is commutative. This variant * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall * throughput across the GPU when suitably occupied. However, turn-around latency may be * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable * when the GPU is under-occupied. */ BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, /** * \par Overview * An efficient "raking" reduction algorithm that supports commutative * (e.g., addition) and non-commutative (e.g., string concatenation) reduction * operators. \blocked. * * \par * Execution is comprised of three phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Each thread then places the partial reduction * of its item(s) into shared memory. * -# Upsweep sequential reduction in shared memory. Threads within a * single warp rake across segments of shared partial reductions. * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * * \par * \image html block_reduce.png *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* * \par Performance Considerations * - This variant performs more communication than BLOCK_REDUCE_RAKING * and is only preferable when the reduction operator is non-commutative. This variant * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall * throughput across the GPU when suitably occupied. However, turn-around latency may be * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable * when the GPU is under-occupied. */ BLOCK_REDUCE_RAKING, /** * \par Overview * A quick "tiled warp-reductions" reduction algorithm that supports commutative * (e.g., addition) and non-commutative (e.g., string concatenation) reduction * operators. * * \par * Execution is comprised of four phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Each thread then places the partial reduction * of its item(s) into shared memory. * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style * reduction within each warp. * -# A propagation phase where the warp reduction outputs in each warp are * updated with the aggregate from each preceding warp. * * \par * \image html block_scan_warpscans.png *
\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block partitioned into 4-thread warps.
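 *
 * \par
 * This is the default algorithm for cub::BlockReduce, so the two
 * specializations sketched below are equivalent (assuming \p int data and a
 * 1D block of 128 threads):
 * \code
 * typedef cub::BlockReduce<int, 128>                                    BlockReduceDefault;
 * typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceExplicit;
 * \endcode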
* * \par Performance Considerations * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall * throughput across the GPU. However turn-around latency may be lower and * thus useful when the GPU is under-occupied. */ BLOCK_REDUCE_WARP_REDUCTIONS, }; /****************************************************************************** * Block reduce ******************************************************************************/ /** * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) * \ingroup BlockModule * * \tparam T Data type being reduced * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A reduction (or fold) * uses a binary combining operator to compute a single aggregate from a list of input elements. * - \rowmajor * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) * * \par Performance Considerations * - \granularity * - Very efficient (only one synchronization barrier). * - Incurs zero bank conflicts for most types * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: * - Summation (vs. generic reduction) * - \p BLOCK_THREADS is a multiple of the architecture's warp size * - Every thread has a valid input (i.e., full vs. partial-tiles) * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives * * \par A Simple Example * \blockcollective{BlockReduce} * \par * The code snippet below illustrates a sum reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... 
* * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * * \endcode * * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu */ template < typename T, int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockReduce { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; typedef BlockReduceWarpReductions WarpReductions; typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; typedef BlockReduceRaking Raking; /// Internal specialization type using InternalBlockReduce = cub::detail::conditional_t< ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS, WarpReductions, cub::detail::conditional_t>; // BlockReduceRaking /// Shared memory storage layout type for BlockReduce typedef typename InternalBlockReduce::TempStorage _TempStorage; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; public: /// \smemstorage{BlockReduce} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockReduce() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockReduce( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Generic reductions *********************************************************************/ //@{ /** * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. * * \par * - The return value is undefined in threads other than thread0. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a max reduction of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Each thread obtains an input item * int thread_data; * ... * * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); * * \endcode * * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input ReductionOp reduction_op) ///< [in] Binary reduction functor { return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); } /** * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. * * \par * - The return value is undefined in threads other than thread0. * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a max reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); * * \endcode * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) */ template < int ITEMS_PER_THREAD, typename ReductionOp> __device__ __forceinline__ T Reduce( T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment ReductionOp reduction_op) ///< [in] Binary reduction functor { // Reduce partials T partial = internal::ThreadReduce(inputs, reduction_op); return Reduce(partial, reduction_op); } /** * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. * * \par * - The return value is undefined in threads other than thread0. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a max reduction of a partially-full tile of integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int num_valid, ...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Each thread obtains an input item * int thread_data; * if (threadIdx.x < num_valid) thread_data = ... 
* * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); * * \endcode * * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input ReductionOp reduction_op, ///< [in] Binary reduction functor int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) { // Determine if we scan skip bounds checking if (num_valid >= BLOCK_THREADS) { return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); } else { return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); } } //@} end member group /******************************************************************//** * \name Summation reductions *********************************************************************/ //@{ /** * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. * * \par * - The return value is undefined in threads other than thread0. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a sum reduction of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Each thread obtains an input item * int thread_data; * ... * * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * * \endcode * */ __device__ __forceinline__ T Sum( T input) ///< [in] Calling thread's input { return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); } /** * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. * * \par * - The return value is undefined in threads other than thread0. * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sum reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * * \endcode * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. */ template __device__ __forceinline__ T Sum( T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment { // Reduce partials T partial = internal::ThreadReduce(inputs, cub::Sum()); return Sum(partial); } /** * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. 
* * \par * - The return value is undefined in threads other than thread0. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int num_valid, ...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Each thread obtains an input item (up to num_items) * int thread_data; * if (threadIdx.x < num_valid) * thread_data = ... * * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); * * \endcode * */ __device__ __forceinline__ T Sum( T input, ///< [in] Calling thread's input int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) { // Determine if we scan skip bounds checking if (num_valid >= BLOCK_THREADS) { return InternalBlockReduce(temp_storage).template Sum(input, num_valid); } else { return InternalBlockReduce(temp_storage).template Sum(input, num_valid); } } //@} end member group }; /** * \example example_block_reduce.cu */ CUB_NAMESPACE_END cub-2.0.1/cub/block/block_run_length_decode.cuh000066400000000000000000000450051434614775400214740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include "../config.cuh" #include "../thread/thread_search.cuh" #include "../util_math.cuh" #include "../util_namespace.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" #include "block_scan.cuh" #include #include CUB_NAMESPACE_BEGIN /** * \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. 
That is, given * the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output * array. * Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded * array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows * retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS * * DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned. * * \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). * A run of length zero may not be followed by a run length that is not zero. * * \par * \code * __global__ void ExampleKernel(...) * { * // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t * using RunItemT = uint64_t; * // Type large enough to index into the run-length decoded array * using RunLengthT = uint32_t; * * // Specialising BlockRunLengthDecode for a 1D block of 128 threads * constexpr int BLOCK_DIM_X = 128; * // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs * constexpr int RUNS_PER_THREAD = 2; * // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items * constexpr int DECODED_ITEMS_PER_THREAD = 4; * * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each * using BlockRunLengthDecodeT = * cub::BlockRunLengthDecode; * * // Allocate shared memory for BlockRunLengthDecode * __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage; * * // The run-length encoded items and how often they shall be repeated in the run-length decoded output * RunItemT run_values[RUNS_PER_THREAD]; * RunLengthT run_lengths[RUNS_PER_THREAD]; * ... * * // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode * uint32_t total_decoded_size = 0; * BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size); * * // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs * // have been decoded. * uint32_t decoded_window_offset = 0U; * while (decoded_window_offset < total_decoded_size) * { * RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD]; * RunItemT decoded_items[DECODED_ITEMS_PER_THREAD]; * * // The number of decoded items that are valid within this window (aka pass) of run-length decoding * uint32_t num_valid_items = total_decoded_size - decoded_window_offset; * block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset); * * decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD; * * ... * } * } * \endcode * \par * Suppose the set of input \p run_values across the block of threads is * { [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] } and * \p run_lengths is { [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }. * The corresponding output \p decoded_items in those threads will be { [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], * [4, 4, 4, 5], ..., [169, 169, 170, 171] } and \p relative_offsets will be { [0, 0, 1, 0], [1, 2, 0, 1], [2, * 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] } during the first iteration of the while loop. 
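 *
 * \par
 * In the sketch above, the final window is typically only partially filled.
 * A hypothetical way to guard stores against the invalid tail (the output
 * pointer \p d_out is assumed here and is not part of the snippet above):
 * \code
 * #pragma unroll
 * for (int i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
 * {
 *   // Window-relative index of this thread's i-th decoded item (blocked arrangement)
 *   uint32_t window_idx = threadIdx.x * DECODED_ITEMS_PER_THREAD + i;
 *   if (window_idx < num_valid_items)
 *     d_out[decoded_window_offset + window_idx] = decoded_items[i];
 * }
 * \endcode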
* * \tparam ItemT The data type of the items being run-length decoded * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes * \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds * \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the * runs' lengths) * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension */ template class BlockRunLengthDecode { //--------------------------------------------------------------------- // CONFIGS & TYPE ALIASES //--------------------------------------------------------------------- private: /// The thread block size in threads static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; /// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0') static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD; /// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' length) using RunOffsetScanT = BlockScan; /// Type used to index into the block's runs using RunOffsetT = uint32_t; /// Shared memory type required by this thread block union _TempStorage { typename RunOffsetScanT::TempStorage offset_scan; struct { ItemT run_values[BLOCK_RUNS]; DecodedOffsetT run_offsets[BLOCK_RUNS]; } runs; }; // union TempStorage /// Internal storage allocator (used when the user does not provide pre-allocated shared memory) __device__ __forceinline__ _TempStorage &PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id uint32_t linear_tid; public: struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // CONSTRUCTOR //--------------------------------------------------------------------- /** * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The * algorithm's temporary storage may not be repurposed between the constructor call and subsequent * RunLengthDecode calls. */ template __device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage, ItemT (&run_values)[RUNS_PER_THREAD], RunLengthT (&run_lengths)[RUNS_PER_THREAD], TotalDecodedSizeT &total_decoded_size) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) { InitWithRunLengths(run_values, run_lengths, total_decoded_size); } /** * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The * algorithm's temporary storage may not be repurposed between the constructor call and subsequent * RunLengthDecode calls. */ template __device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage, ItemT (&run_values)[RUNS_PER_THREAD], UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) { InitWithRunOffsets(run_values, run_offsets); } /** * \brief Constructor specialised for static temporary storage, initializing using the runs' lengths. 
*/ template __device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD], RunLengthT (&run_lengths)[RUNS_PER_THREAD], TotalDecodedSizeT &total_decoded_size) : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) { InitWithRunLengths(run_values, run_lengths, total_decoded_size); } /** * \brief Constructor specialised for static temporary storage, initializing using the runs' offsets. */ template __device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD], UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) { InitWithRunOffsets(run_values, run_offsets); } private: /** * \brief Returns the offset of the first value within \p input which compares greater than \p val. This version takes * \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary search * iterations at compile time. */ template __device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence OffsetT num_items, ///< [in] Input sequence length T val) ///< [in] Search key { OffsetT lower_bound = 0; OffsetT upper_bound = num_items; #pragma unroll for (int i = 0; i <= Log2::VALUE; i++) { OffsetT mid = cub::MidPoint(lower_bound, upper_bound); mid = (cub::min)(mid, num_items - 1); if (val < input[mid]) { upper_bound = mid; } else { lower_bound = mid + 1; } } return lower_bound; } template __device__ __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD], RunOffsetT (&run_offsets)[RUNS_PER_THREAD]) { // Keep the runs' items and the offsets of each run's beginning in the temporary storage RunOffsetT thread_dst_offset = static_cast(linear_tid) * static_cast(RUNS_PER_THREAD); #pragma unroll for (int i = 0; i < RUNS_PER_THREAD; i++) { temp_storage.runs.run_values[thread_dst_offset] = run_values[i]; temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i]; thread_dst_offset++; } // Ensure run offsets and run values have been writen to shared memory CTA_SYNC(); } template __device__ __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD], RunLengthT (&run_lengths)[RUNS_PER_THREAD], TotalDecodedSizeT &total_decoded_size) { // Compute the offset for the beginning of each run DecodedOffsetT run_offsets[RUNS_PER_THREAD]; #pragma unroll for (int i = 0; i < RUNS_PER_THREAD; i++) { run_offsets[i] = static_cast(run_lengths[i]); } DecodedOffsetT decoded_size_aggregate; RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate); total_decoded_size = static_cast(decoded_size_aggregate); // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation) CTA_SYNC(); InitWithRunOffsets(run_values, run_offsets); } public: /** * \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the * run-length decode buffer (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within * the buffer are returned. Subsequent calls to RunLengthDecode adjusting \p from_decoded_offset can be * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to * RunLengthDecode is not required. 
* \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the * run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length * decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`. * \smemreuse * * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement * \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to * \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results * in undefined behavior. */ template __device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD], DecodedOffsetT from_decoded_offset = 0) { // The (global) offset of the first item decoded by this thread DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD; // The run that the first decoded item of this thread belongs to // If this thread's is already beyond the total decoded size, it will be assigned to the // last run RunOffsetT assigned_run = StaticUpperBound(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) - static_cast(1U); DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run]; // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD : temp_storage.runs.run_offsets[assigned_run + 1]; ItemT val = temp_storage.runs.run_values[assigned_run]; #pragma unroll for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++) { decoded_items[i] = val; item_offsets[i] = thread_decoded_offset - assigned_run_begin; if (thread_decoded_offset == assigned_run_end - 1) { // We make sure that a thread is not re-entering this conditional when being assigned to the last run already by // extending the last run's length to all the thread's item assigned_run++; assigned_run_begin = temp_storage.runs.run_offsets[assigned_run]; // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD : temp_storage.runs.run_offsets[assigned_run + 1]; val = temp_storage.runs.run_values[assigned_run]; } thread_decoded_offset++; } } /** * \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the * run-length decode buffer (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within * the buffer are returned. Subsequent calls to RunLengthDecode adjusting \p from_decoded_offset can be * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to * RunLengthDecode is not required. * * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement * \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results * in undefined behavior. 
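 *
 * \par
 * A minimal calling sketch, reusing the \p block_rld instance,
 * \p decoded_window_offset, and the constants from the class-level example
 * above (the relative offsets are simply not requested):
 * \code
 * RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
 * block_rld.RunLengthDecode(decoded_items, decoded_window_offset);
 * \endcode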
*/ __device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], DecodedOffsetT from_decoded_offset = 0) { DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD]; RunLengthDecode(decoded_items, item_offsets, from_decoded_offset); } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_scan.cuh000066400000000000000000003114251434614775400167520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. */ #pragma once #include "specializations/block_scan_raking.cuh" #include "specializations/block_scan_warp_scans.cuh" #include "../config.cuh" #include "../util_type.cuh" #include "../util_ptx.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Algorithmic variants ******************************************************************************/ /** * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. */ enum BlockScanAlgorithm { /** * \par Overview * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. * -# Downsweep sequential exclusive scan in shared memory. 
Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. * * \par * \image html block_scan_raking.png *
\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
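 *
 * \par
 * This is the default algorithm for cub::BlockScan; an explicit specialization
 * is sketched below (assuming \p int data and a 1D block of 128 threads):
 * \code
 * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;   // same as cub::BlockScan<int, 128>
 * \endcode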
* * \par Performance Considerations * - Although this variant may suffer longer turnaround latencies when the * GPU is under-occupied, it can often provide higher overall throughput * across the GPU when suitably occupied. */ BLOCK_SCAN_RAKING, /** * \par Overview * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at * the expense of higher register pressure. Raking threads preserve their * "upsweep" segment of values in registers while performing warp-synchronous * scan, allowing the "downsweep" not to re-read them from shared memory. */ BLOCK_SCAN_RAKING_MEMOIZE, /** * \par Overview * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. * * \par * \image html block_scan_warpscans.png *
\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block partitioned into 4-thread warps.
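 *
 * \par
 * For illustration, a minimal sketch of selecting this latency-oriented variant
 * (assuming \p int data, a 1D block of 128 threads, and a per-thread input
 * \p thread_data, as in the examples elsewhere in this header):
 * \code
 * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
 * __shared__ typename BlockScan::TempStorage temp_storage;
 * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
 * \endcode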
* * \par Performance Considerations * - Although this variant may suffer lower overall throughput across the * GPU because due to a heavy reliance on inefficient warpscans, it can * often provide lower turnaround latencies when the GPU is under-occupied. */ BLOCK_SCAN_WARP_SCANS, }; /****************************************************************************** * Block scan ******************************************************************************/ /** * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) * \ingroup BlockModule * * \tparam T Data type being scanned * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) * produces an output list where each element is computed to be the reduction * of the elements occurring earlier in the input list. Prefix sum * connotes a prefix scan with the addition operator. The term \em inclusive indicates * that the ith output reduction incorporates the ith input. * The term \em exclusive indicates the ith input is not incorporated into * the ith output reduction. * - \rowmajor * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: * -# cub::BLOCK_SCAN_RAKING. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) * * \par Performance Considerations * - \granularity * - Uses special instructions when applicable (e.g., warp \p SHFL) * - Uses synchronization-free communication between warp lanes when applicable * - Invokes a minimal number of minimal block-wide synchronization barriers (only * one or two depending on algorithm selection) * - Incurs zero bank conflicts for most types * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: * - Prefix sum variants (vs. generic scan) * - \blocksize * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives * * \par A Simple Example * \blockcollective{BlockScan} * \par * The code snippet below illustrates an exclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. * The corresponding output \p thread_data in those threads will be * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. * * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockScan. */ template < typename T, int BLOCK_DIM_X, BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockScan { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /** * Ensure the template parameterization meets the requirements of the * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy * cannot be used with thread block sizes not a multiple of the * architectural warp size. */ static const BlockScanAlgorithm SAFE_ALGORITHM = ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(0) != 0)) ? BLOCK_SCAN_RAKING : ALGORITHM; typedef BlockScanWarpScans WarpScans; typedef BlockScanRaking Raking; /// Define the delegate type for the desired algorithm using InternalBlockScan = cub::detail::conditional_t< SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>; /// Shared memory storage layout type for BlockScan typedef typename InternalBlockScan::TempStorage _TempStorage; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /****************************************************************************** * Public types ******************************************************************************/ public: /// \smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
*/ __device__ __forceinline__ BlockScan() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockScan( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Exclusive prefix sum operations *********************************************************************/ //@{ /** * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. * * \par * - \identityzero * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates an exclusive prefix sum of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain input item for each thread * int thread_data; * ... * * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. * */ __device__ __forceinline__ void ExclusiveSum( T input, ///< [in] Calling thread's input item T &output) ///< [out] Calling thread's output item (may be aliased to \p input) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } /** * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - \identityzero * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates an exclusive prefix sum of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain input item for each thread * int thread_data; * ... * * // Collectively compute the block-wide exclusive prefix sum * int block_aggregate; * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
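 *
 * \par
 * As a follow-up sketch, the block-wide total could then be published once per
 * block (the output array \p d_block_sums is assumed here and is not part of
 * the snippet above):
 * \code
 * if (threadIdx.x == 0)
 *     d_block_sums[blockIdx.x] = block_aggregate;  // block_aggregate is valid in every thread
 * \endcode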
* */ __device__ __forceinline__ void ExclusiveSum( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } /** * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - \identityzero * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. * The functor will be invoked by the first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. * \par * \code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied * // during consecutive scan operations. * struct BlockPrefixCallbackOp * { * // Running prefix * int running_total; * * // Constructor * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} * * // Callback operator to be entered by the first warp of threads in the block. * // Thread-0 is responsible for returning a value for seeding the block-wide scan. * __device__ int operator()(int block_aggregate) * { * int old_prefix = running_total; * running_total += block_aggregate; * return old_prefix; * } * }; * * __global__ void ExampleKernel(int *d_data, int num_items, ...) * { * // Specialize BlockScan for a 1D block of 128 threads * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Initialize running total * BlockPrefixCallbackOp prefix_op(0); * * // Have the block iterate over segments of items * for (int block_offset = 0; block_offset < num_items; block_offset += 128) * { * // Load a segment of consecutive items that are blocked across threads * int thread_data = d_data[block_offset]; * * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum( * thread_data, thread_data, prefix_op); * CTA_SYNC(); * * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } * \endcode * \par * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... * The corresponding output for the first segment will be 0, 1, ..., 127. * The output for the second segment will be 128, 129, ..., 255. 
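 *
 * \par
 * As a usage note for the sketch above: each tile advances the callback's
 * state, so after the loop the running prefix equals the grand total, e.g.:
 * \code
 * // Valid in threads of the first warp, which are the ones that invoke the callback
 * int grand_total = prefix_op.running_total;
 * \endcode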
* * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) */ template __device__ __forceinline__ void ExclusiveSum( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } //@} end member group /******************************************************************//** * \name Exclusive prefix sum operations (multiple data per thread) *********************************************************************/ //@{ /** * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. * * \par * - \identityzero * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates an exclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. */ template __device__ __forceinline__ void ExclusiveSum( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } /** * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - \identityzero * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates an exclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute the block-wide exclusive prefix sum * int block_aggregate; * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. */ template __device__ __forceinline__ void ExclusiveSum( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { // Reduce consecutive thread items in registers T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } /** * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - \identityzero * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. * The functor will be invoked by the first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) * across 128 threads where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied * // during consecutive scan operations. * struct BlockPrefixCallbackOp * { * // Running prefix * int running_total; * * // Constructor * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} * * // Callback operator to be entered by the first warp of threads in the block. * // Thread-0 is responsible for returning a value for seeding the block-wide scan. * __device__ int operator()(int block_aggregate) * { * int old_prefix = running_total; * running_total += block_aggregate; * return old_prefix; * } * }; * * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
* { * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread * typedef cub::BlockLoad BlockLoad; * typedef cub::BlockStore BlockStore; * typedef cub::BlockScan BlockScan; * * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan * __shared__ union { * typename BlockLoad::TempStorage load; * typename BlockScan::TempStorage scan; * typename BlockStore::TempStorage store; * } temp_storage; * * // Initialize running total * BlockPrefixCallbackOp prefix_op(0); * * // Have the block iterate over segments of items * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) * { * // Load a segment of consecutive items that are blocked across threads * int thread_data[4]; * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); * CTA_SYNC(); * * // Collectively compute the block-wide exclusive prefix sum * int block_aggregate; * BlockScan(temp_storage.scan).ExclusiveSum( * thread_data, thread_data, prefix_op); * CTA_SYNC(); * * // Store scanned items to output segment * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } * \endcode * \par * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) */ template < int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp> __device__ __forceinline__ void ExclusiveSum( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } //@} end member group // Exclusive prefix sums /******************************************************************//** * \name Exclusive prefix scan operations *********************************************************************/ //@{ /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. * * \par * - Supports non-commutative scan operators. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain input item for each thread * int thread_data; * ... * * // Collectively compute the block-wide exclusive prefix max scan * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. 
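 *
 * \par
 * For illustration, a user-defined functor with the required signature might
 * look like the following sketch (a hypothetical minimum operator, not part of
 * the snippet above):
 * \code
 * struct CustomMin
 * {
 *     __device__ __forceinline__
 *     int operator()(const int &a, const int &b) const { return (b < a) ? b : a; }
 * };
 * \endcode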
* * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) ScanOp scan_op) ///< [in] Binary scan functor { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); } /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - Supports non-commutative scan operators. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain input item for each thread * int thread_data; * ... * * // Collectively compute the block-wide exclusive prefix max scan * int block_aggregate; * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. * * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input items T &output, ///< [out] Calling thread's output items (may be aliased to \p input) T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) ScanOp scan_op, ///< [in] Binary scan functor T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); } /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. * The functor will be invoked by the first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. 
* - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. * \par * \code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied * // during consecutive scan operations. * struct BlockPrefixCallbackOp * { * // Running prefix * int running_total; * * // Constructor * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} * * // Callback operator to be entered by the first warp of threads in the block. * // Thread-0 is responsible for returning a value for seeding the block-wide scan. * __device__ int operator()(int block_aggregate) * { * int old_prefix = running_total; * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; * return old_prefix; * } * }; * * __global__ void ExampleKernel(int *d_data, int num_items, ...) * { * // Specialize BlockScan for a 1D block of 128 threads * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Initialize running total * BlockPrefixCallbackOp prefix_op(INT_MIN); * * // Have the block iterate over segments of items * for (int block_offset = 0; block_offset < num_items; block_offset += 128) * { * // Load a segment of consecutive items that are blocked across threads * int thread_data = d_data[block_offset]; * * // Collectively compute the block-wide exclusive prefix max scan * BlockScan(temp_storage).ExclusiveScan( * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); * CTA_SYNC(); * * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } * \endcode * \par * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. * * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) */ template < typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan functor BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); } //@} end member group // Inclusive prefix sums /******************************************************************//** * \name Exclusive prefix scan operations (multiple data per thread) *********************************************************************/ //@{ /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. * * \par * - Supports non-commutative scan operators. 
* - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute the block-wide exclusive prefix max scan * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. * The corresponding output \p thread_data in those threads will be * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template < int ITEMS_PER_THREAD, typename ScanOp> __device__ __forceinline__ void ExclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) ScanOp scan_op) ///< [in] Binary scan functor { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); // Exclusive scan in registers with prefix as seed internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - Supports non-commutative scan operators. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute the block-wide exclusive prefix max scan * int block_aggregate; * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. 
* Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template < int ITEMS_PER_THREAD, typename ScanOp> __device__ __forceinline__ void ExclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) ScanOp scan_op, ///< [in] Binary scan functor T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); // Exclusive scan in registers with prefix as seed internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. * The functor will be invoked by the first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. * \par * \code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied * // during consecutive scan operations. * struct BlockPrefixCallbackOp * { * // Running prefix * int running_total; * * // Constructor * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} * * // Callback operator to be entered by the first warp of threads in the block. * // Thread-0 is responsible for returning a value for seeding the block-wide scan. * __device__ int operator()(int block_aggregate) * { * int old_prefix = running_total; * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; * return old_prefix; * } * }; * * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
* { * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread * typedef cub::BlockLoad BlockLoad; * typedef cub::BlockStore BlockStore; * typedef cub::BlockScan BlockScan; * * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan * __shared__ union { * typename BlockLoad::TempStorage load; * typename BlockScan::TempStorage scan; * typename BlockStore::TempStorage store; * } temp_storage; * * // Initialize running total * BlockPrefixCallbackOp prefix_op(0); * * // Have the block iterate over segments of items * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) * { * // Load a segment of consecutive items that are blocked across threads * int thread_data[4]; * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); * CTA_SYNC(); * * // Collectively compute the block-wide exclusive prefix max scan * BlockScan(temp_storage.scan).ExclusiveScan( * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); * CTA_SYNC(); * * // Store scanned items to output segment * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } * \endcode * \par * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) */ template < int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void ExclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan functor BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); // Exclusive scan in registers with prefix as seed internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } //@} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans /******************************************************************//** * \name Exclusive prefix scan operations (no initial value, single datum per thread) *********************************************************************/ //@{ /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. * * \par * - Supports non-commutative scan operators. 
* - \rowmajor * - \smemreuse * * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan functor { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); } /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. * * \par * - Supports non-commutative scan operators. * - \rowmajor * - \smemreuse * * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan functor T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); } //@} end member group /******************************************************************//** * \name Exclusive prefix scan operations (no initial value, multiple data per thread) *********************************************************************/ //@{ /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. * * \par * - Supports non-commutative scan operators. * - \blocked * - \granularity * - \smemreuse * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template < int ITEMS_PER_THREAD, typename ScanOp> __device__ __forceinline__ void ExclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan functor { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_partial, thread_partial, scan_op); // Exclusive scan in registers with prefix internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); } /** * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. * * \par * - Supports non-commutative scan operators. * - \blocked * - \granularity * - \smemreuse * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
* \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template < int ITEMS_PER_THREAD, typename ScanOp> __device__ __forceinline__ void ExclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan functor T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); // Exclusive scan in registers with prefix internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); } //@} end member group #endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans /******************************************************************//** * \name Inclusive prefix sum operations *********************************************************************/ //@{ /** * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. * * \par * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates an inclusive prefix sum of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain input item for each thread * int thread_data; * ... * * // Collectively compute the block-wide inclusive prefix sum * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. * */ __device__ __forceinline__ void InclusiveSum( T input, ///< [in] Calling thread's input item T &output) ///< [out] Calling thread's output item (may be aliased to \p input) { InclusiveScan(input, output, cub::Sum()); } /** * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates an inclusive prefix sum of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain input item for each thread * int thread_data; * ... * * // Collectively compute the block-wide inclusive prefix sum * int block_aggregate; * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. 
* Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. * */ __device__ __forceinline__ void InclusiveSum( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { InclusiveScan(input, output, cub::Sum(), block_aggregate); } /** * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. * The functor will be invoked by the first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. * \par * \code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied * // during consecutive scan operations. * struct BlockPrefixCallbackOp * { * // Running prefix * int running_total; * * // Constructor * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} * * // Callback operator to be entered by the first warp of threads in the block. * // Thread-0 is responsible for returning a value for seeding the block-wide scan. * __device__ int operator()(int block_aggregate) * { * int old_prefix = running_total; * running_total += block_aggregate; * return old_prefix; * } * }; * * __global__ void ExampleKernel(int *d_data, int num_items, ...) * { * // Specialize BlockScan for a 1D block of 128 threads * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Initialize running total * BlockPrefixCallbackOp prefix_op(0); * * // Have the block iterate over segments of items * for (int block_offset = 0; block_offset < num_items; block_offset += 128) * { * // Load a segment of consecutive items that are blocked across threads * int thread_data = d_data[block_offset]; * * // Collectively compute the block-wide inclusive prefix sum * BlockScan(temp_storage).InclusiveSum( * thread_data, thread_data, prefix_op); * CTA_SYNC(); * * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } * \endcode * \par * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... * The corresponding output for the first segment will be 1, 2, ..., 128. * The output for the second segment will be 129, 130, ..., 256. 
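 *
 * \par
 * A worked sketch of the callback protocol for this input (each tile of 128
 * ones produces a block aggregate of 128, and the callback returns the prior
 * running total as the tile's seed):
 * \code
 * // tile 0: prefix_op(128) returns 0    -> running_total becomes 128
 * // tile 1: prefix_op(128) returns 128  -> running_total becomes 256
 * // tile 2: prefix_op(128) returns 256  -> running_total becomes 384
 * \endcode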
* * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) */ template __device__ __forceinline__ void InclusiveSum( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. { InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } //@} end member group /******************************************************************//** * \name Inclusive prefix sum operations (multiple data per thread) *********************************************************************/ //@{ /** * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. * * \par * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates an inclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute the block-wide inclusive prefix sum * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. */ template __device__ __forceinline__ void InclusiveSum( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) { if (ITEMS_PER_THREAD == 1) { InclusiveSum(input[0], output[0]); } else { // Reduce consecutive thread items in registers Sum scan_op; T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveSum(thread_prefix, thread_prefix); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } /** * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates an inclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute the block-wide inclusive prefix sum * int block_aggregate; * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The * corresponding output \p thread_data in those threads will be * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void InclusiveSum( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { if (ITEMS_PER_THREAD == 1) { InclusiveSum(input[0], output[0], block_aggregate); } else { // Reduce consecutive thread items in registers Sum scan_op; T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } /** * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. * The functor will be invoked by the first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) * across 128 threads where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied * // during consecutive scan operations. 
* struct BlockPrefixCallbackOp * { * // Running prefix * int running_total; * * // Constructor * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} * * // Callback operator to be entered by the first warp of threads in the block. * // Thread-0 is responsible for returning a value for seeding the block-wide scan. * __device__ int operator()(int block_aggregate) * { * int old_prefix = running_total; * running_total += block_aggregate; * return old_prefix; * } * }; * * __global__ void ExampleKernel(int *d_data, int num_items, ...) * { * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread * typedef cub::BlockLoad BlockLoad; * typedef cub::BlockStore BlockStore; * typedef cub::BlockScan BlockScan; * * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan * __shared__ union { * typename BlockLoad::TempStorage load; * typename BlockScan::TempStorage scan; * typename BlockStore::TempStorage store; * } temp_storage; * * // Initialize running total * BlockPrefixCallbackOp prefix_op(0); * * // Have the block iterate over segments of items * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) * { * // Load a segment of consecutive items that are blocked across threads * int thread_data[4]; * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); * CTA_SYNC(); * * // Collectively compute the block-wide inclusive prefix sum * BlockScan(temp_storage.scan).InclusiveSum( * thread_data, thread_data, prefix_op); * CTA_SYNC(); * * // Store scanned items to output segment * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } * \endcode * \par * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512. * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) */ template < int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp> __device__ __forceinline__ void InclusiveSum( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. { if (ITEMS_PER_THREAD == 1) { InclusiveSum(input[0], output[0], block_prefix_callback_op); } else { // Reduce consecutive thread items in registers Sum scan_op; T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); } } //@} end member group /******************************************************************//** * \name Inclusive prefix scan operations *********************************************************************/ //@{ /** * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. * * \par * - Supports non-commutative scan operators.
* - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain input item for each thread * int thread_data; * ... * * // Collectively compute the block-wide inclusive prefix max scan * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. * * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan functor { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); } /** * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - Supports non-commutative scan operators. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain input item for each thread * int thread_data; * ... * * // Collectively compute the block-wide inclusive prefix max scan * int block_aggregate; * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. * * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan functor T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); } /** * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. 
Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. * The functor will be invoked by the first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. * \par * \code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied * // during consecutive scan operations. * struct BlockPrefixCallbackOp * { * // Running prefix * int running_total; * * // Constructor * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} * * // Callback operator to be entered by the first warp of threads in the block. * // Thread-0 is responsible for returning a value for seeding the block-wide scan. * __device__ int operator()(int block_aggregate) * { * int old_prefix = running_total; * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; * return old_prefix; * } * }; * * __global__ void ExampleKernel(int *d_data, int num_items, ...) * { * // Specialize BlockScan for a 1D block of 128 threads * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Initialize running total * BlockPrefixCallbackOp prefix_op(INT_MIN); * * // Have the block iterate over segments of items * for (int block_offset = 0; block_offset < num_items; block_offset += 128) * { * // Load a segment of consecutive items that are blocked across threads * int thread_data = d_data[block_offset]; * * // Collectively compute the block-wide inclusive prefix max scan * BlockScan(temp_storage).InclusiveScan( * thread_data, thread_data, cub::Max(), prefix_op); * CTA_SYNC(); * * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } * \endcode * \par * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. * * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) */ template < typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan functor BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
{ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); } //@} end member group /******************************************************************//** * \name Inclusive prefix scan operations (multiple data per thread) *********************************************************************/ //@{ /** * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. * * \par * - Supports non-commutative scan operators. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute the block-wide inclusive prefix max scan * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template < int ITEMS_PER_THREAD, typename ScanOp> __device__ __forceinline__ void InclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan functor { if (ITEMS_PER_THREAD == 1) { InclusiveScan(input[0], output[0], scan_op); } else { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, scan_op); // Inclusive scan in registers with prefix as seed (first thread does not seed) internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } /** * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - Supports non-commutative scan operators. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { * // Specialize BlockScan for a 1D block of 128 threads of type int * typedef cub::BlockScan BlockScan; * * // Allocate shared memory for BlockScan * __shared__ typename BlockScan::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Collectively compute the block-wide inclusive prefix max scan * int block_aggregate; * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. * The corresponding output \p thread_data in those threads will be * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) */ template < int ITEMS_PER_THREAD, typename ScanOp> __device__ __forceinline__ void InclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan functor T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { if (ITEMS_PER_THREAD == 1) { InclusiveScan(input[0], output[0], scan_op, block_aggregate); } else { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan (with no initial value) ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); // Inclusive scan in registers with prefix as seed (first thread does not seed) internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } /** * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. * * \par * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. * The functor will be invoked by the first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. * - \blocked * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. * \par * \code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied * // during consecutive scan operations. 
* struct BlockPrefixCallbackOp * { * // Running prefix * int running_total; * * // Constructor * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} * * // Callback operator to be entered by the first warp of threads in the block. * // Thread-0 is responsible for returning a value for seeding the block-wide scan. * __device__ int operator()(int block_aggregate) * { * int old_prefix = running_total; * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; * return old_prefix; * } * }; * * __global__ void ExampleKernel(int *d_data, int num_items, ...) * { * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread * typedef cub::BlockLoad BlockLoad; * typedef cub::BlockStore BlockStore; * typedef cub::BlockScan BlockScan; * * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan * __shared__ union { * typename BlockLoad::TempStorage load; * typename BlockScan::TempStorage scan; * typename BlockStore::TempStorage store; * } temp_storage; * * // Initialize running total * BlockPrefixCallbackOp prefix_op(0); * * // Have the block iterate over segments of items * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) * { * // Load a segment of consecutive items that are blocked across threads * int thread_data[4]; * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); * CTA_SYNC(); * * // Collectively compute the block-wide inclusive prefix max scan * BlockScan(temp_storage.scan).InclusiveScan( * thread_data, thread_data, cub::Max(), prefix_op); * CTA_SYNC(); * * // Store scanned items to output segment * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } * \endcode * \par * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) */ template < int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void InclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan functor BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
{ if (ITEMS_PER_THREAD == 1) { InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); } else { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); } } //@} end member group }; /** * \example example_block_scan.cu */ CUB_NAMESPACE_END cub-2.0.1/cub/block/block_shuffle.cuh000066400000000000000000000267171434614775400174710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. */ #pragma once #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. * \ingroup BlockModule * * \tparam T The data type to be exchanged. * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * It is commonplace for blocks of threads to rearrange data items between * threads. The BlockShuffle abstraction allows threads to efficiently shift items * either (a) up to their successor or (b) down to their predecessor. 
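 *
 * \par A Simple Example
 * A minimal usage sketch (illustrative only; it assumes a 1D block of 128
 * threads, four items per thread, and uses the Up() method documented below):
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_shuffle.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockShuffle for a 1D block of 128 threads of type int
 *     typedef cub::BlockShuffle<int, 128> BlockShuffle;
 *
 *     // Allocate shared memory for BlockShuffle
 *     __shared__ typename BlockShuffle::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
 *
 *     // Shift the blocked arrangement up by one item; thread_data[0] receives
 *     // the predecessor thread's last item (not updated for thread0)
 *     BlockShuffle(temp_storage).Up(thread_data, thread_data);
 * }
 * \endcode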
* */ template < typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockShuffle { private: /****************************************************************************** * Constants ******************************************************************************/ enum { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, }; /****************************************************************************** * Type definitions ******************************************************************************/ /// Shared memory storage layout type (last element from each thread's input) typedef T _TempStorage[BLOCK_THREADS]; public: /// \smemstorage{BlockShuffle} struct TempStorage : Uninitialized<_TempStorage> {}; private: /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } public: /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockShuffle() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockShuffle( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Shuffle movement *********************************************************************/ //@{ /** * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. * * \par * - \smemreuse */ __device__ __forceinline__ void Offset( T input, ///< [in] The input item from the calling thread (threadi) T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 int distance = 1) ///< [in] Offset distance (may be negative) { temp_storage[linear_tid] = input; CTA_SYNC(); const int offset_tid = static_cast(linear_tid) + distance; if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS)) { output = temp_storage[static_cast(offset_tid)]; } } /** * \brief Each threadi obtains the \p input provided by threadi+distance. 
* * \par * - \smemreuse */ __device__ __forceinline__ void Rotate( T input, ///< [in] The calling thread's input item T& output, ///< [out] The \p input item from thread thread(i+distance>)%BLOCK_THREADS (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) { temp_storage[linear_tid] = input; CTA_SYNC(); unsigned int offset = threadIdx.x + distance; if (offset >= BLOCK_THREADS) offset -= BLOCK_THREADS; output = temp_storage[offset]; } /** * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item * * \par * - \blocked * - \granularity * - \smemreuse */ template __device__ __forceinline__ void Up( T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. { temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); #pragma unroll for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) prev[ITEM] = input[ITEM - 1]; if (linear_tid > 0) prev[0] = temp_storage[linear_tid - 1]; } /** * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by threadBLOCK_THREADS-1. * * \par * - \blocked * - \granularity * - \smemreuse */ template __device__ __forceinline__ void Up( T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from threadBLOCK_THREADS-1, provided to all threads { Up(input, prev); block_suffix = temp_storage[BLOCK_THREADS - 1]; } /** * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item * * \par * - \blocked * - \granularity * - \smemreuse */ template __device__ __forceinline__ void Down( T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. { temp_storage[linear_tid] = input[0]; CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) prev[ITEM] = input[ITEM + 1]; if (linear_tid < BLOCK_THREADS - 1) prev[ITEMS_PER_THREAD - 1] = temp_storage[linear_tid + 1]; } /** * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread0. * * \par * - \blocked * - \granularity * - \smemreuse */ template __device__ __forceinline__ void Down( T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. 
T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads { Down(input, prev); block_prefix = temp_storage[0]; } //@} end member group }; CUB_NAMESPACE_END cub-2.0.1/cub/block/block_store.cuh000066400000000000000000001263251434614775400171650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Operations for writing linear segments of data from the CUDA thread block */ #pragma once #include #include #include "block_exchange.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIo * @{ */ /******************************************************************//** * \name Blocked arrangement I/O (direct) *********************************************************************/ //@{ /** * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. * * \blocked * * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
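 *
 * \par Snippet
 * The following sketch is for illustration only (the kernel name and the choice of
 * 4 items per thread are hypothetical). Thread i writes its items to the
 * output range [4*i, 4*i+3].
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, ...)
 * {
 *     // Each thread owns 4 consecutive items
 *     int thread_data[4];
 *     ...
 *
 *     // Store the blocked arrangement directly to d_data
 *     cub::StoreDirectBlocked(threadIdx.x, d_data, thread_data);
 * }
 * \endcode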
*/ template < typename T, int ITEMS_PER_THREAD, typename OutputIteratorT> __device__ __forceinline__ void StoreDirectBlocked( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); // Store directly in thread-blocked order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { thread_itr[ITEM] = items[ITEM]; } } /** * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range * * \blocked * * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. */ template < typename T, int ITEMS_PER_THREAD, typename OutputIteratorT> __device__ __forceinline__ void StoreDirectBlocked( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); // Store directly in thread-blocked order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) { thread_itr[ITEM] = items[ITEM]; } } } /** * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. * * \blocked * * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, * which is the default starting offset returned by \p cudaMalloc() * * \par * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: * - \p ITEMS_PER_THREAD is odd * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) * * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * */ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void StoreDirectBlockedVectorized( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) T *block_ptr, ///< [in] Input pointer for storing from T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { enum { // Maximum CUDA vector size is 4 elements MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), // Vector size must be a power of two and an even divisor of the items per thread VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? 
MAX_VEC_SIZE : 1, VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, }; // Vector type typedef typename CubVector::Type Vector; // Alias global pointer Vector *block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) Vector raw_vector[VECTORS_PER_THREAD]; T *raw_items = reinterpret_cast(raw_vector); // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { raw_items[ITEM] = items[ITEM]; } // Direct-store using vector types StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector); } //@} end member group /******************************************************************//** * \name Striped arrangement I/O (direct) *********************************************************************/ //@{ /** * \brief Store a striped arrangement of data across the thread block into a linear segment of items. * * \striped * * \tparam BLOCK_THREADS The thread block size in threads * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. */ template < int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename OutputIteratorT> __device__ __forceinline__ void StoreDirectStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { OutputIteratorT thread_itr = block_itr + linear_tid; // Store directly in striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; } } /** * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range * * \striped * * \tparam BLOCK_THREADS The thread block size in threads * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. */ template < int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename OutputIteratorT> __device__ __forceinline__ void StoreDirectStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { OutputIteratorT thread_itr = block_itr + linear_tid; // Store directly in striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) { thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; } } } //@} end member group /******************************************************************//** * \name Warp-striped arrangement I/O (direct) *********************************************************************/ //@{ /** * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. 
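 *
 * \par
 * For example, with 32-thread warps and four items per thread, lane l of
 * warp w writes its items to output offsets (w * 128) + l, (w * 128) + l + 32,
 * (w * 128) + l + 64, and (w * 128) + l + 96; each warp covers a contiguous
 * 128-item segment using stride-32 accesses per thread.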
* * \warpstriped * * \par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. */ template < typename T, int ITEMS_PER_THREAD, typename OutputIteratorT> __device__ __forceinline__ void StoreDirectWarpStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; OutputIteratorT thread_itr = block_itr + warp_offset + tid; // Store directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; } } /** * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range * * \warpstriped * * \par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. */ template < typename T, int ITEMS_PER_THREAD, typename OutputIteratorT> __device__ __forceinline__ void StoreDirectWarpStriped( int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; OutputIteratorT thread_itr = block_itr + warp_offset + tid; // Store directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) { thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; } } } //@} end member group /** @} */ // end group UtilIo //----------------------------------------------------------------------------- // Generic BlockStore abstraction //----------------------------------------------------------------------------- /** * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. */ enum BlockStoreAlgorithm { /** * \par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is written * directly to memory. * * \par Performance Considerations * - The utilization of memory transactions (coalescing) decreases as the * access stride between threads increases (i.e., the number items per thread). 
*/ BLOCK_STORE_DIRECT, /** * \par Overview * A [striped arrangement](index.html#sec5sec3) of data is written * directly to memory. * * \par Performance Considerations * The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. */ BLOCK_STORE_STRIPED, /** * \par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is written directly * to memory using CUDA's built-in vectorized stores as a coalescing optimization. * For example, st.global.v4.s32 instructions will be generated * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. * * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high until the the * access stride between threads (i.e., the number items per thread) exceeds the * maximum vector store width (typically 4 items or 64B, whichever is lower). * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: * - \p ITEMS_PER_THREAD is odd * - The \p OutputIteratorT is not a simple pointer type * - The block output offset is not quadword-aligned * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) */ BLOCK_STORE_VECTORIZE, /** * \par Overview * A [blocked arrangement](index.html#sec5sec3) is locally * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). * * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. * - The local reordering incurs slightly longer latencies and throughput than the * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. */ BLOCK_STORE_TRANSPOSE, /** * \par Overview * A [blocked arrangement](index.html#sec5sec3) is locally * transposed and then efficiently written to memory as a * [warp-striped arrangement](index.html#sec5sec3) * * \par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. * - The local reordering incurs slightly longer latencies and throughput than the * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. */ BLOCK_STORE_WARP_TRANSPOSE, /** * \par Overview * A [blocked arrangement](index.html#sec5sec3) is locally * transposed and then efficiently written to memory as a * [warp-striped arrangement](index.html#sec5sec3) * To reduce the shared memory requirement, only one warp's worth of shared * memory is provisioned and is subsequently time-sliced among warps. * * \par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. * - Provisions less shared memory temporary storage, but incurs larger * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. */ BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, }; /** * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) * \ingroup BlockModule * \ingroup UtilIo * * \tparam T The type of data to be written. 
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - The BlockStore class provides a single data movement abstraction that can be specialized * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different * performance policies for different architectures, data types, granularity sizes, etc. * - BlockStore can be optionally specialized by different data movement strategies: * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) * -# cub::BLOCK_STORE_STRIPED. A [striped arrangement](index.html#sec5sec3) * of data is written directly to memory. [More...](\ref cub::BlockStoreAlgorithm) * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) * of data is written directly to memory using CUDA's built-in vectorized stores as a * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) * -# cub::BLOCK_STORE_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) * is locally transposed into a [striped arrangement](index.html#sec5sec3) which is * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) * -# cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED. A [blocked arrangement](index.html#sec5sec3) * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is * then written to memory. To reduce the shared memory requireent, only one warp's worth of shared * memory is provisioned and is subsequently time-sliced among warps. [More...](\ref cub::BlockStoreAlgorithm) * - \rowmajor * * \par A Simple Example * \blockcollective{BlockStore} * \par * The code snippet below illustrates the storing of a "blocked" arrangement * of 512 integers across 128 threads (where each thread owns 4 consecutive items) * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, * meaning items are locally reordered among threads so that memory references will be * efficiently coalesced using a warp-striped access pattern. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockStore BlockStore; * * // Allocate shared memory for BlockStore * __shared__ typename BlockStore::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Store items to linear memory * BlockStore(temp_storage).Store(d_data, thread_data); * * \endcode * \par * Suppose the set of \p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... 
* * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockStore. */ template < typename T, int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int LEGACY_PTX_ARCH = 0> class BlockStore { private: /****************************************************************************** * Constants and typed definitions ******************************************************************************/ /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /****************************************************************************** * Algorithmic variants ******************************************************************************/ /// Store helper template struct StoreInternal; /** * BLOCK_STORE_DIRECT specialization of store helper */ template struct StoreInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ StoreInternal( TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /// Store items into a linear segment of memory template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreDirectBlocked(linear_tid, block_itr, items); } /// Store items into a linear segment of memory, guarded by range template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } }; /** * BLOCK_STORE_STRIPED specialization of store helper */ template struct StoreInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ StoreInternal( TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /// Store items into a linear segment of memory template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreDirectStriped(linear_tid, block_itr, items); } /// Store items into a linear segment of memory, guarded by range template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { StoreDirectStriped(linear_tid, block_itr, items, valid_items); } }; /** * BLOCK_STORE_VECTORIZE specialization of store helper */ template struct StoreInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ StoreInternal( TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /// Store items into a linear segment of memory, specialized for native pointer types (attempts 
vectorization) __device__ __forceinline__ void Store( T *block_ptr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreDirectBlockedVectorized(linear_tid, block_ptr, items); } /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreDirectBlocked(linear_tid, block_itr, items); } /// Store items into a linear segment of memory, guarded by range template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } }; /** * BLOCK_STORE_TRANSPOSE specialization of store helper */ template struct StoreInternal { // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage { /// Temporary storage for partially-full block guard volatile int valid_items; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage &temp_storage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ StoreInternal( TempStorage &temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()), linear_tid(linear_tid) {} /// Store items into a linear segment of memory template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockExchange(temp_storage).BlockedToStriped(items); StoreDirectStriped(linear_tid, block_itr, items); } /// Store items into a linear segment of memory, guarded by range template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { BlockExchange(temp_storage).BlockedToStriped(items); if (linear_tid == 0) temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads CTA_SYNC(); StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; /** * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper */ template struct StoreInternal { enum { WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage { /// Temporary storage for partially-full block guard volatile int valid_items; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage &temp_storage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ StoreInternal( TempStorage &temp_storage, int linear_tid) : 
temp_storage(temp_storage.Alias()), linear_tid(linear_tid) {} /// Store items into a linear segment of memory template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockExchange(temp_storage).BlockedToWarpStriped(items); StoreDirectWarpStriped(linear_tid, block_itr, items); } /// Store items into a linear segment of memory, guarded by range template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { BlockExchange(temp_storage).BlockedToWarpStriped(items); if (linear_tid == 0) temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads CTA_SYNC(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; /** * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper */ template struct StoreInternal { enum { WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage { /// Temporary storage for partially-full block guard volatile int valid_items; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage &temp_storage; /// Linear thread-id int linear_tid; /// Constructor __device__ __forceinline__ StoreInternal( TempStorage &temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()), linear_tid(linear_tid) {} /// Store items into a linear segment of memory template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockExchange(temp_storage).BlockedToWarpStriped(items); StoreDirectWarpStriped(linear_tid, block_itr, items); } /// Store items into a linear segment of memory, guarded by range template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { BlockExchange(temp_storage).BlockedToWarpStriped(items); if (linear_tid == 0) temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads CTA_SYNC(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; /****************************************************************************** * Type definitions ******************************************************************************/ /// Internal load implementation to use typedef StoreInternal InternalStore; /// Shared memory storage layout type typedef typename InternalStore::TempStorage _TempStorage; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ 
__forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /****************************************************************************** * Thread fields ******************************************************************************/ /// Thread reference to shared storage _TempStorage &temp_storage; /// Linear thread-id int linear_tid; public: /// \smemstorage{BlockStore} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockStore() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockStore( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Data movement *********************************************************************/ //@{ /** * \brief Store items into a linear segment of memory. * * \par * - \blocked * - \smemreuse * * \par Snippet * The code snippet below illustrates the storing of a "blocked" arrangement * of 512 integers across 128 threads (where each thread owns 4 consecutive items) * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, * meaning items are locally reordered among threads so that memory references will be * efficiently coalesced using a warp-striped access pattern. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockStore BlockStore; * * // Allocate shared memory for BlockStore * __shared__ typename BlockStore::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Store items to linear memory * int thread_data[4]; * BlockStore(temp_storage).Store(d_data, thread_data); * * \endcode * \par * Suppose the set of \p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... * */ template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [out] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { InternalStore(temp_storage, linear_tid).Store(block_itr, items); } /** * \brief Store items into a linear segment of memory, guarded by range. * * \par * - \blocked * - \smemreuse * * \par Snippet * The code snippet below illustrates the guarded storing of a "blocked" arrangement * of 512 integers across 128 threads (where each thread owns 4 consecutive items) * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, * meaning items are locally reordered among threads so that memory references will be * efficiently coalesced using a warp-striped access pattern. 
* \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) * { * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockStore BlockStore; * * // Allocate shared memory for BlockStore * __shared__ typename BlockStore::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Store items to linear memory * int thread_data[4]; * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); * * \endcode * \par * Suppose the set of \p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with * only the first two threads being unmasked to store portions of valid data. * */ template __device__ __forceinline__ void Store( OutputIteratorT block_itr, ///< [out] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); } //@} end member group }; template > struct BlockStoreType { using type = cub::BlockStore; }; CUB_NAMESPACE_END cub-2.0.1/cub/block/radix_rank_sort_operations.cuh000066400000000000000000000131441434614775400223050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * radix_rank_sort_operations.cuh contains common abstractions, definitions and * operations used for radix sorting and ranking. */ #pragma once #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** \brief Twiddling keys for radix sort. 
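    In() maps a key's bit pattern to an unsigned integer whose ascending order
    matches the requested key order (the pattern is additionally
    bitwise-complemented when IS_DESCENDING), and Out() inverts that mapping.
    A minimal usage sketch, assuming the template parameters are
    <bool IS_DESCENDING, typename KeyT> (illustrative only; not taken from the
    library's examples):

    \code
    // Ascending radix sort of unsigned int keys
    using TwiddleT = cub::RadixSortTwiddle<false, unsigned int>;
    using BitsT    = TwiddleT::UnsignedBits;

    BitsT key  = 42u;                            // key bits to be ranked
    BitsT bits = TwiddleT::In(key);              // order-preserving bit pattern
    // ... rank and scatter by digits extracted from bits ...
    unsigned int restored = TwiddleT::Out(bits); // recovers the original key
    \endcode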
*/ template struct RadixSortTwiddle { typedef Traits TraitsT; typedef typename TraitsT::UnsignedBits UnsignedBits; static __host__ __device__ __forceinline__ UnsignedBits In(UnsignedBits key) { key = TraitsT::TwiddleIn(key); if (IS_DESCENDING) key = ~key; return key; } static __host__ __device__ __forceinline__ UnsignedBits Out(UnsignedBits key) { if (IS_DESCENDING) key = ~key; key = TraitsT::TwiddleOut(key); return key; } static __host__ __device__ __forceinline__ UnsignedBits DefaultKey() { return Out(~UnsignedBits(0)); } }; /** \brief Base struct for digit extractor. Contains common code to provide special handling for floating-point -0.0. \note This handles correctly both the case when the keys are bitwise-complemented after twiddling for descending sort (in onesweep) as well as when the keys are not bit-negated, but the implementation handles descending sort separately (in other implementations in CUB). Twiddling alone maps -0.0f to 0x7fffffff and +0.0f to 0x80000000 for float, which are subsequent bit patterns and bitwise complements of each other. For onesweep, both -0.0f and +0.0f are mapped to the bit pattern of +0.0f (0x80000000) for ascending sort, and to the pattern of -0.0f (0x7fffffff) for descending sort. For all other sorting implementations in CUB, both are always mapped to +0.0f. Since bit patterns for both -0.0f and +0.0f are next to each other and only one of them is used, the sorting works correctly. For double, the same applies, but with 64-bit patterns. */ template struct BaseDigitExtractor { typedef Traits TraitsT; typedef typename TraitsT::UnsignedBits UnsignedBits; enum { FLOAT_KEY = TraitsT::CATEGORY == FLOATING_POINT, }; static __device__ __forceinline__ UnsignedBits ProcessFloatMinusZero(UnsignedBits key) { if (!FLOAT_KEY) { return key; } else { UnsignedBits TWIDDLED_MINUS_ZERO_BITS = TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1)); UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0); return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key; } } }; /** \brief A wrapper type to extract digits. Uses the BFE intrinsic to extract a * key from a digit. */ template struct BFEDigitExtractor : BaseDigitExtractor { using typename BaseDigitExtractor::UnsignedBits; uint32_t bit_start, num_bits; explicit __device__ __forceinline__ BFEDigitExtractor( uint32_t bit_start = 0, uint32_t num_bits = 0) : bit_start(bit_start), num_bits(num_bits) { } __device__ __forceinline__ uint32_t Digit(UnsignedBits key) { return BFE(this->ProcessFloatMinusZero(key), bit_start, num_bits); } }; /** \brief A wrapper type to extract digits. Uses a combination of shift and * bitwise and to extract digits. */ template struct ShiftDigitExtractor : BaseDigitExtractor { using typename BaseDigitExtractor::UnsignedBits; uint32_t bit_start, mask; explicit __device__ __forceinline__ ShiftDigitExtractor( uint32_t bit_start = 0, uint32_t num_bits = 0) : bit_start(bit_start), mask((1 << num_bits) - 1) { } __device__ __forceinline__ uint32_t Digit(UnsignedBits key) { return uint32_t(this->ProcessFloatMinusZero(key) >> UnsignedBits(bit_start)) & mask; } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/specializations/000077500000000000000000000000001434614775400173465ustar00rootroot00000000000000cub-2.0.1/cub/block/specializations/block_histogram_atomic.cuh000066400000000000000000000061221434614775400245530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. 
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ #pragma once #include "../../config.cuh" CUB_NAMESPACE_BEGIN /** * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ template struct BlockHistogramAtomic { /// Shared memory storage layout type struct TempStorage {}; /// Constructor __device__ __forceinline__ BlockHistogramAtomic( TempStorage &temp_storage) {} /// Composite data onto an existing histogram template < typename T, typename CounterT, int ITEMS_PER_THREAD> __device__ __forceinline__ void Composite( T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram { // Update histogram #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; ++i) { atomicAdd(histogram + items[i], 1); } } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/specializations/block_histogram_sort.cuh000066400000000000000000000200451434614775400242660ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ #pragma once #include "../../block/block_radix_sort.cuh" #include "../../block/block_discontinuity.cuh" #include "../../config.cuh" #include "../../util_ptx.cuh" CUB_NAMESPACE_BEGIN /** * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ template < typename T, ///< Sample type int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int ITEMS_PER_THREAD, ///< The number of samples per thread int BINS, ///< The number of bins into which histogram samples may fall int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective (unused) struct BlockHistogramSort { /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; // Parameterize BlockRadixSort type for our thread block typedef BlockRadixSort< T, BLOCK_DIM_X, ITEMS_PER_THREAD, NullType, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockRadixSortT; // Parameterize BlockDiscontinuity type for our thread block typedef BlockDiscontinuity< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockDiscontinuityT; /// Shared memory union _TempStorage { // Storage for sorting bin values typename BlockRadixSortT::TempStorage sort; struct Discontinuities { // Storage for detecting discontinuities in the tile of sorted bin values typename BlockDiscontinuityT::TempStorage flag; // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values unsigned int run_begin[BINS]; unsigned int run_end[BINS]; } discontinuities; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Thread fields _TempStorage &temp_storage; unsigned int linear_tid; /// Constructor __device__ __forceinline__ BlockHistogramSort( TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} // Discontinuity functor struct DiscontinuityOp { // Reference to temp_storage _TempStorage &temp_storage; // Constructor __device__ 
__forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : temp_storage(temp_storage) {} // Discontinuity predicate __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) { if (a != b) { // Note the begin/end offsets in shared storage temp_storage.discontinuities.run_begin[b] = b_index; temp_storage.discontinuities.run_end[a] = b_index; return true; } else { return false; } } }; // Composite data onto an existing histogram template < typename CounterT > __device__ __forceinline__ void Composite( T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // Sort bytes in blocked arrangement BlockRadixSortT(temp_storage.sort).Sort(items); CTA_SYNC(); // Initialize the shared memory's run_begin and run_end for each bin int histo_offset = 0; #pragma unroll for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) { temp_storage.discontinuities.run_begin[histo_offset + linear_tid] = TILE_SIZE; temp_storage.discontinuities.run_end[histo_offset + linear_tid] = TILE_SIZE; } // Finish up with guarded initialization if necessary if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) { temp_storage.discontinuities.run_begin[histo_offset + linear_tid] = TILE_SIZE; temp_storage.discontinuities.run_end[histo_offset + linear_tid] = TILE_SIZE; } CTA_SYNC(); int flags[ITEMS_PER_THREAD]; // unused // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile DiscontinuityOp flag_op(temp_storage); BlockDiscontinuityT(temp_storage.discontinuities.flag).FlagHeads(flags, items, flag_op); // Update begin for first item if (linear_tid == 0) temp_storage.discontinuities.run_begin[items[0]] = 0; CTA_SYNC(); // Composite into histogram histo_offset = 0; #pragma unroll for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) { int thread_offset = histo_offset + linear_tid; CounterT count = temp_storage.discontinuities.run_end[thread_offset] - temp_storage.discontinuities.run_begin[thread_offset]; histogram[thread_offset] += count; } // Finish up with guarded composition if necessary if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) { int thread_offset = histo_offset + linear_tid; CounterT count = temp_storage.discontinuities.run_end[thread_offset] - temp_storage.discontinuities.run_begin[thread_offset]; histogram[thread_offset] += count; } } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/specializations/block_reduce_raking.cuh000066400000000000000000000225731434614775400240340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. */ #pragma once #include "../../block/block_raking_layout.cuh" #include "../../warp/warp_reduce.cuh" #include "../../thread/thread_reduce.cuh" #include "../../config.cuh" #include "../../util_ptx.cuh" CUB_NAMESPACE_BEGIN /** * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. * * Supports non-commutative binary reduction operators. Unlike commutative * reduction operators (e.g., addition), the application of a non-commutative * reduction operator (e.g., string concatenation) across a sequence of inputs must * honor the relative ordering of items and partial reductions when applying the * reduction operator. * * Compared to the implementation of BlockReduceRakingCommutativeOnly (which does not support * non-commutative operators), this implementation requires a few extra * rounds of inter-thread communication.
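 *
 * As a rough illustration (a sketch assuming 32-thread warps and the default
 * BlockRakingLayout parameters; the exact numbers depend on the configuration),
 * a 128-thread block reduces as follows:
 *
 *   1) all 128 threads deposit their partial into the shared raking grid;
 *   2) 32 raking threads each serially reduce a segment of 4 grid entries;
 *   3) a single 32-thread warp reduction combines the 32 raking partials.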
*/ template < typename T, ///< Data type being reduced int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockReduceRaking { /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Layout type for padded thread block raking grid typedef BlockRakingLayout BlockRakingLayout; /// WarpReduce utility type typedef typename WarpReduce::InternalWarpReduce WarpReduce; /// Constants enum { /// Number of raking threads RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, /// Number of raking elements per warp synchronous raking thread SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, /// Cooperative work can be entirely warp synchronous WARP_SYNCHRONOUS = (int(RAKING_THREADS) == int(BLOCK_THREADS)), /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, /// Whether or not accesses into smem are unguarded RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, }; /// Shared memory storage layout type union _TempStorage { typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Thread fields _TempStorage &temp_storage; unsigned int linear_tid; /// Constructor __device__ __forceinline__ BlockReduceRaking( TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} template __device__ __forceinline__ T RakingReduction( ReductionOp reduction_op, ///< [in] Binary scan operator T *raking_segment, T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) Int2Type /*iteration*/) { // Update partial if addend is in range if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) { T addend = raking_segment[ITERATION]; partial = reduction_op(partial, addend); } return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); } template __device__ __forceinline__ T RakingReduction( ReductionOp /*reduction_op*/, ///< [in] Binary scan operator T * /*raking_segment*/, T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) Int2Type /*iteration*/) { return partial; } /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
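    ///
    /// A minimal front-end sketch (hypothetical kernel and buffer names): this
    /// specialization is what cub::BlockReduce dispatches to when the
    /// BLOCK_REDUCE_RAKING algorithm is selected.
    ///
    ///     #include <cub/block/block_reduce.cuh>
    ///
    ///     __global__ void ExampleKernel(const int *d_in, int *d_out)
    ///     {
    ///         typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING> BlockReduce;
    ///         __shared__ typename BlockReduce::TempStorage temp_storage;
    ///
    ///         int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];
    ///
    ///         // Block-wide reduction with a custom operator; the aggregate is
    ///         // only valid in thread0
    ///         int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
    ///
    ///         if (threadIdx.x == 0)
    ///             d_out[blockIdx.x] = aggregate;
    ///     }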
template < bool IS_FULL_TILE, typename ReductionOp> __device__ __forceinline__ T Reduce( T partial, ///< [in] Calling thread's input partial reductions int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) ReductionOp reduction_op) ///< [in] Binary reduction operator { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) partial = WarpReduce(temp_storage.warp_storage).template Reduce( partial, num_valid, reduction_op); } else { // Place partial into shared memory grid. *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; CTA_SYNC(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) { // Raking reduction in grid T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); partial = raking_segment[0]; partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); int valid_raking_threads = (IS_FULL_TILE) ? RAKING_THREADS : (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH; partial = WarpReduce(temp_storage.warp_storage).template Reduce( partial, valid_raking_threads, reduction_op); } } return partial; } /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. template __device__ __forceinline__ T Sum( T partial, ///< [in] Calling thread's input partial reductions int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) { cub::Sum reduction_op; return Reduce(partial, num_valid, reduction_op); } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/specializations/block_reduce_raking_commutative_only.cuh000066400000000000000000000202161434614775400275020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. */ #pragma once #include "block_reduce_raking.cuh" #include "../../warp/warp_reduce.cuh" #include "../../thread/thread_reduce.cuh" #include "../../config.cuh" #include "../../util_ptx.cuh" CUB_NAMESPACE_BEGIN /** * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. */ template < typename T, ///< Data type being reduced int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockReduceRakingCommutativeOnly { /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values typedef BlockReduceRaking FallBack; /// Constants enum { /// Number of warp threads WARP_THREADS = CUB_WARP_THREADS(0), /// Whether or not to use fall-back USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), /// Number of raking threads RAKING_THREADS = WARP_THREADS, /// Number of threads actually sharing items with the raking threads SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), /// Number of raking elements per warp synchronous raking thread SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, }; /// WarpReduce utility type typedef WarpReduce WarpReduce; /// Layout type for padded thread block raking grid typedef BlockRakingLayout BlockRakingLayout; /// Shared memory storage layout type union _TempStorage { struct DefaultStorage { typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid } default_storage; typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Thread fields _TempStorage &temp_storage; unsigned int linear_tid; /// Constructor __device__ __forceinline__ BlockReduceRakingCommutativeOnly( TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
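    ///
    /// Illustration (a sketch assuming 32-thread warps; the numbers describe one
    /// common configuration): for a 128-thread full tile, RAKING_THREADS = 32,
    /// SHARING_THREADS = 96 and SEGMENT_LENGTH = 3. Because addition is
    /// commutative, the 32 raking threads keep their own partial in a register
    /// and only the other 96 threads write to the raking grid; each raking
    /// thread then folds 3 shared partials into its register value before the
    /// final 32-thread warp reduction, saving the extra shared-memory traffic
    /// incurred by BlockReduceRaking.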
template __device__ __forceinline__ T Sum( T partial, ///< [in] Calling thread's input partial reductions int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) { if (USE_FALLBACK || !FULL_TILE) { return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); } else { // Place partial into shared memory grid if (linear_tid >= RAKING_THREADS) *BlockRakingLayout::PlacementPtr(temp_storage.default_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; CTA_SYNC(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) { // Raking reduction in grid T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.default_storage.raking_grid, linear_tid); partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); // Warpscan partial = WarpReduce(temp_storage.default_storage.warp_storage).Sum(partial); } } return partial; } /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. template < bool FULL_TILE, typename ReductionOp> __device__ __forceinline__ T Reduce( T partial, ///< [in] Calling thread's input partial reductions int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) ReductionOp reduction_op) ///< [in] Binary reduction operator { if (USE_FALLBACK || !FULL_TILE) { return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); } else { // Place partial into shared memory grid if (linear_tid >= RAKING_THREADS) *BlockRakingLayout::PlacementPtr(temp_storage.default_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; CTA_SYNC(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) { // Raking reduction in grid T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.default_storage.raking_grid, linear_tid); partial = internal::ThreadReduce(raking_segment, reduction_op, partial); // Warpscan partial = WarpReduce(temp_storage.default_storage.warp_storage).Reduce(partial, reduction_op); } } return partial; } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/specializations/block_reduce_warp_reductions.cuh000066400000000000000000000227411434614775400257660ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. */ #pragma once #include #include #include #include CUB_NAMESPACE_BEGIN /** * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. */ template < typename T, ///< Data type being reduced int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockReduceWarpReductions { /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, /// Number of warp threads WARP_THREADS = CUB_WARP_THREADS(0), /// Number of active warps WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, /// The logical warp size for warp reductions LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), /// Whether or not the logical warp size evenly divides the thread block size EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) }; /// WarpReduce utility type typedef typename WarpReduce::InternalWarpReduce WarpReduce; /// Shared memory storage layout type struct _TempStorage { typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan T block_prefix; ///< Shared prefix for the entire thread block }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Thread fields _TempStorage &temp_storage; int linear_tid; int warp_id; int lane_id; /// Constructor __device__ __forceinline__ BlockReduceWarpReductions( TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), lane_id(LaneId()) {} template __device__ __forceinline__ T ApplyWarpAggregates( ReductionOp reduction_op, ///< [in] Binary scan operator T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) Int2Type /*successor_warp*/) { if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) { T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; warp_aggregate = reduction_op(warp_aggregate, addend); } return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); } template __device__ __forceinline__ T ApplyWarpAggregates( ReductionOp /*reduction_op*/, ///< [in] Binary scan operator T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) Int2Type /*successor_warp*/) { return warp_aggregate; } /// Returns block-wide aggregate in thread0. template < bool FULL_TILE, typename ReductionOp> __device__ __forceinline__ T ApplyWarpAggregates( ReductionOp reduction_op, ///< [in] Binary scan operator T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) { // Share lane aggregates if (lane_id == 0) { detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id, warp_aggregate); } CTA_SYNC(); // Update total aggregate in warp 0, lane 0 if (linear_tid == 0) { warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); } return warp_aggregate; } /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. template __device__ __forceinline__ T Sum( T input, ///< [in] Calling thread's input partial reductions int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) { cub::Sum reduction_op; int warp_offset = (warp_id * LOGICAL_WARP_SIZE); int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? LOGICAL_WARP_SIZE : num_valid - warp_offset; // Warp reduction in every warp T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( input, warp_num_valid, cub::Sum()); // Update outputs and block_aggregate with warp-wide aggregates from lane-0s return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. template < bool FULL_TILE, typename ReductionOp> __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input partial reductions int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) ReductionOp reduction_op) ///< [in] Binary reduction operator { int warp_offset = warp_id * LOGICAL_WARP_SIZE; int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? 
LOGICAL_WARP_SIZE : num_valid - warp_offset; // Warp reduction in every warp T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( input, warp_num_valid, reduction_op); // Update outputs and block_aggregate with warp-wide aggregates from lane-0s return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/specializations/block_scan_raking.cuh000066400000000000000000000672051434614775400235120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. */ #pragma once #include "../../config.cuh" #include "../../util_ptx.cuh" #include "../../block/block_raking_layout.cuh" #include "../../thread/thread_reduce.cuh" #include "../../thread/thread_scan.cuh" #include "../../warp/warp_scan.cuh" CUB_NAMESPACE_BEGIN /** * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. 
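 *
 * As an illustration of the raking strategy (a sketch, not a normative
 * description of every configuration): each thread deposits its input into a
 * shared raking grid; a single warp of raking threads then (1) serially
 * reduces its grid segment (upsweep), (2) performs a warp-synchronous scan
 * over those per-segment totals, and (3) serially re-scans its segment seeded
 * with the scanned total (downsweep); finally every thread reads its result
 * back out of the grid. For example, an exclusive sum (seeded with 0) over
 * per-thread inputs beginning [3, 1, 7, 0, ...] produces outputs beginning
 * [0, 3, 4, 11, ...].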
*/ template < typename T, ///< Data type being scanned int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanRaking { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Layout type for padded thread block raking grid typedef BlockRakingLayout BlockRakingLayout; /// Constants enum { /// Number of raking threads RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, /// Number of raking elements per warp synchronous raking thread SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, /// Cooperative work can be entirely warp synchronous WARP_SYNCHRONOUS = (int(BLOCK_THREADS) == int(RAKING_THREADS)), }; /// WarpScan utility type typedef WarpScan WarpScan; /// Shared memory storage layout type struct _TempStorage { typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid T block_aggregate; ///< Block aggregate }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- // Thread fields _TempStorage &temp_storage; unsigned int linear_tid; T cached_segment[SEGMENT_LENGTH]; //--------------------------------------------------------------------- // Utility methods //--------------------------------------------------------------------- /// Templated reduction template __device__ __forceinline__ T GuardedReduce( T* raking_ptr, ///< [in] Input array ScanOp scan_op, ///< [in] Binary reduction operator T raking_partial, ///< [in] Prefix to seed reduction with Int2Type /*iteration*/) { if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) { T addend = raking_ptr[ITERATION]; raking_partial = scan_op(raking_partial, addend); } return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); } /// Templated reduction (base case) template __device__ __forceinline__ T GuardedReduce( T* /*raking_ptr*/, ///< [in] Input array ScanOp /*scan_op*/, ///< [in] Binary reduction operator T raking_partial, ///< [in] Prefix to seed reduction with Int2Type /*iteration*/) { return raking_partial; } /// Templated copy template __device__ __forceinline__ void CopySegment( T* out, ///< [out] Out array T* in, ///< [in] Input array Int2Type /*iteration*/) { out[ITERATION] = in[ITERATION]; CopySegment(out, in, Int2Type()); } /// Templated copy (base case) __device__ __forceinline__ void CopySegment( T* /*out*/, ///< [out] Out array T* /*in*/, ///< [in] Input array Int2Type /*iteration*/) {} /// Performs upsweep raking reduction, returning the aggregate template __device__ __forceinline__ T Upsweep( ScanOp scan_op) { T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, 
linear_tid); // Read data into registers CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); T raking_partial = cached_segment[0]; return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>()); } /// Performs exclusive downsweep raking scan template __device__ __forceinline__ void ExclusiveDownsweep( ScanOp scan_op, T raking_partial, bool apply_prefix = true) { T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); // Read data back into registers if (!MEMOIZE) { CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); } internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); // Write data back to smem CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); } /// Performs inclusive downsweep raking scan template __device__ __forceinline__ void InclusiveDownsweep( ScanOp scan_op, T raking_partial, bool apply_prefix = true) { T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); // Read data back into registers if (!MEMOIZE) { CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); } internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); // Write data back to smem CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); } //--------------------------------------------------------------------- // Constructors //--------------------------------------------------------------------- /// Constructor __device__ __forceinline__ BlockScanRaking( TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //--------------------------------------------------------------------- // Exclusive scans //--------------------------------------------------------------------- /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan operator { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); } else { // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T exclusive_partial; WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } CTA_SYNC(); // Grab thread prefix from shared memory exclusive_output = *placement_ptr; } } /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
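    ///
    /// Illustrative front-end usage (a hypothetical kernel fragment; this
    /// specialization backs cub::BlockScan when BLOCK_SCAN_RAKING or
    /// BLOCK_SCAN_RAKING_MEMOIZE is selected):
    ///
    ///     #include <cub/block/block_scan.cuh>
    ///     #include <climits>   // INT_MIN
    ///
    ///     typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;
    ///     __shared__ typename BlockScan::TempStorage temp_storage;
    ///
    ///     int thread_data = ...;  // one item per thread
    ///
    ///     // Exclusive block-wide max-scan, seeded with INT_MIN
    ///     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());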
template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input items T &output, ///< [out] Calling thread's output items (may be aliased to \p input) const T &initial_value, ///< [in] Initial value to seed the exclusive scan ScanOp scan_op) ///< [in] Binary scan operator { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); } else { // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Exclusive Warp-synchronous scan T exclusive_partial; WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, exclusive_partial); } CTA_SYNC(); // Grab exclusive partial from shared memory output = *placement_ptr; } } /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); } else { // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial= Upsweep(scan_op); // Warp-synchronous scan T inclusive_partial; T exclusive_partial; WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); // Broadcast aggregate to all threads if (linear_tid == RAKING_THREADS - 1) temp_storage.block_aggregate = inclusive_partial; } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; // Retrieve block aggregate block_aggregate = temp_storage.block_aggregate; } } /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
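    ///
    /// For example, with cub::Sum and one int per thread, \p block_aggregate is
    /// the sum of all BLOCK_THREADS inputs and is valid in every thread (a
    /// common building block for, e.g., counting selected items after scanning
    /// selection flags).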
template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input items T &output, ///< [out] Calling thread's output items (may be aliased to \p input) const T &initial_value, ///< [in] Initial value to seed the exclusive scan ScanOp scan_op, ///< [in] Binary scan operator T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); } else { // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T exclusive_partial; WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, exclusive_partial); // Broadcast aggregate to other threads if (linear_tid == 0) temp_storage.block_aggregate = block_aggregate; } CTA_SYNC(); // Grab exclusive partial from shared memory output = *placement_ptr; // Retrieve block aggregate block_aggregate = temp_storage.block_aggregate; } } /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. template < typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
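    //
    // A typical prefix callback (an illustrative sketch; the struct name is
    // hypothetical) carries a running total across consecutive tiles. It is
    // invoked once per scan by the first warp, and only lane0's return value
    // is used as the seed:
    //
    //     struct BlockPrefixCallbackOp
    //     {
    //         int running_total;
    //
    //         __device__ BlockPrefixCallbackOp(int running_total)
    //             : running_total(running_total) {}
    //
    //         // Receives the block-wide aggregate; returns the prefix to apply
    //         // to this tile and advances the running total
    //         __device__ int operator()(int block_aggregate)
    //         {
    //             int old_prefix = running_total;
    //             running_total += block_aggregate;
    //             return old_prefix;
    //         }
    //     };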
{ if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan T block_aggregate; WarpScan warp_scan(temp_storage.warp_scan); warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); // Obtain warp-wide prefix in lane0, then broadcast to other lanes T block_prefix = block_prefix_callback_op(block_aggregate); block_prefix = warp_scan.Broadcast(block_prefix, 0); output = scan_op(block_prefix, output); if (linear_tid == 0) output = block_prefix; } else { // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { WarpScan warp_scan(temp_storage.warp_scan); // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T exclusive_partial, block_aggregate; warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); // Obtain block-wide prefix in lane0, then broadcast to other lanes T block_prefix = block_prefix_callback_op(block_aggregate); block_prefix = warp_scan.Broadcast(block_prefix, 0); // Update prefix with warpscan exclusive partial T downsweep_prefix = scan_op(block_prefix, exclusive_partial); if (linear_tid == 0) downsweep_prefix = block_prefix; // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, downsweep_prefix); } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; } } //--------------------------------------------------------------------- // Inclusive scans //--------------------------------------------------------------------- /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan operator { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); } else { // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Exclusive Warp-synchronous scan T exclusive_partial; WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); // Inclusive raking downsweep scan InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; } } /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
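    ///
    /// For example, an inclusive sum over per-thread inputs [3, 1, 7, 0] yields
    /// [3, 4, 11, 11], and \p block_aggregate is 11 in every thread.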
template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); } else { // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T inclusive_partial; T exclusive_partial; WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); // Inclusive raking downsweep scan InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); // Broadcast aggregate to all threads if (linear_tid == RAKING_THREADS - 1) temp_storage.block_aggregate = inclusive_partial; } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; // Retrieve block aggregate block_aggregate = temp_storage.block_aggregate; } } /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. template < typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
{ if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan T block_aggregate; WarpScan warp_scan(temp_storage.warp_scan); warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); // Obtain warp-wide prefix in lane0, then broadcast to other lanes T block_prefix = block_prefix_callback_op(block_aggregate); block_prefix = warp_scan.Broadcast(block_prefix, 0); // Update prefix with exclusive warpscan partial output = scan_op(block_prefix, output); } else { // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { WarpScan warp_scan(temp_storage.warp_scan); // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T exclusive_partial, block_aggregate; warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); // Obtain block-wide prefix in lane0, then broadcast to other lanes T block_prefix = block_prefix_callback_op(block_aggregate); block_prefix = warp_scan.Broadcast(block_prefix, 0); // Update prefix with warpscan exclusive partial T downsweep_prefix = scan_op(block_prefix, exclusive_partial); if (linear_tid == 0) downsweep_prefix = block_prefix; // Inclusive raking downsweep scan InclusiveDownsweep(scan_op, downsweep_prefix); } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; } } }; CUB_NAMESPACE_END cub-2.0.1/cub/block/specializations/block_scan_warp_scans.cuh000066400000000000000000000453471434614775400244020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. */ #pragma once #include #include #include #include CUB_NAMESPACE_BEGIN /** * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. */ template < typename T, int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanWarpScans { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// Constants enum { /// Number of warp threads WARP_THREADS = CUB_WARP_THREADS(0), /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, /// Number of active warps WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, }; /// WarpScan utility type typedef WarpScan WarpScanT; /// WarpScan utility type typedef WarpScan WarpAggregateScan; /// Shared memory storage layout type struct __align__(32) _TempStorage { T warp_aggregates[WARPS]; typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans T block_prefix; ///< Shared prefix for the entire thread block }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- // Thread fields _TempStorage &temp_storage; unsigned int linear_tid; unsigned int warp_id; unsigned int lane_id; //--------------------------------------------------------------------- // Constructors //--------------------------------------------------------------------- /// Constructor __device__ __forceinline__ BlockScanWarpScans( TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), lane_id(LaneId()) {} //--------------------------------------------------------------------- // Utility methods //--------------------------------------------------------------------- template __device__ __forceinline__ void ApplyWarpAggregates( T &warp_prefix, ///< [out] The calling thread's partial reduction ScanOp scan_op, ///< [in] Binary scan operator T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items Int2Type /*addend_warp*/) { if (warp_id == WARP) warp_prefix = block_aggregate; T addend = temp_storage.warp_aggregates[WARP]; block_aggregate = scan_op(block_aggregate, addend); ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); } template __device__ __forceinline__ void ApplyWarpAggregates( T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction ScanOp /*scan_op*/, ///< [in] Binary scan operator T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items Int2Type /*addend_warp*/) {} /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
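    ///
    /// Sketch: with four warps whose aggregates are a0..a3 (contributed by the
    /// last lane of each warp), every thread ends up with
    /// block_aggregate = scan_op(scan_op(scan_op(a0, a1), a2), a3), and warp k
    /// receives warp_prefix = a0 combined through a(k-1) (e.g., warp 2 gets
    /// scan_op(a0, a1)). Warp 0's warp_prefix is left unassigned and must not
    /// be consumed.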
template __device__ __forceinline__ T ComputeWarpPrefix( ScanOp scan_op, ///< [in] Binary scan operator T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { // Last lane in each warp shares its warp-aggregate if (lane_id == WARP_THREADS - 1) { detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id, warp_aggregate); } CTA_SYNC(); // Accumulate block aggregates and save the one that is our warp's prefix T warp_prefix; block_aggregate = temp_storage.warp_aggregates[0]; // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); /* #pragma unroll for (int WARP = 1; WARP < WARPS; ++WARP) { if (warp_id == WARP) warp_prefix = block_aggregate; T addend = temp_storage.warp_aggregates[WARP]; block_aggregate = scan_op(block_aggregate, addend); } */ return warp_prefix; } /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. template __device__ __forceinline__ T ComputeWarpPrefix( ScanOp scan_op, ///< [in] Binary scan operator T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items const T &initial_value) ///< [in] Initial value to seed the exclusive scan { T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); warp_prefix = scan_op(initial_value, warp_prefix); if (warp_id == 0) warp_prefix = initial_value; return warp_prefix; } //--------------------------------------------------------------------- // Exclusive scans //--------------------------------------------------------------------- /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan operator { // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. T block_aggregate; ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); } /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input items T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) const T &initial_value, ///< [in] Initial value to seed the exclusive scan ScanOp scan_op) ///< [in] Binary scan operator { T block_aggregate; ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); } /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. 
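    ///
    /// For example, an exclusive sum over per-thread inputs [3, 1, 7, 0]
    /// produces [?, 3, 4, 11]: thread0's output is undefined because no initial
    /// value is supplied (use the overload taking \p initial_value to pin it).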
template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. T inclusive_output; WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); // Apply warp prefix to our lane's partial if (warp_id != 0) { exclusive_output = scan_op(warp_prefix, exclusive_output); if (lane_id == 0) exclusive_output = warp_prefix; } } /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input items T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) const T &initial_value, ///< [in] Initial value to seed the exclusive scan ScanOp scan_op, ///< [in] Binary scan operator T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. T inclusive_output; WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); // Compute the warp-wide prefix and block-wide aggregate for each warp T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); // Apply warp prefix to our lane's partial exclusive_output = scan_op(warp_prefix, exclusive_output); if (lane_id == 0) exclusive_output = warp_prefix; } /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. template < typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. { // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. 
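        // The remaining steps: (1) warp0 invokes block_prefix_callback_op with
        // the block-wide aggregate, and its lane0 publishes the returned prefix
        // to shared memory (the prefix also becomes thread0's exclusive
        // output); (2) the block synchronizes; (3) every other thread folds the
        // prefix into its exclusive output with scan_op.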
T block_aggregate; ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); // Use the first warp to determine the thread block prefix, returning the result in lane0 if (warp_id == 0) { T block_prefix = block_prefix_callback_op(block_aggregate); if (lane_id == 0) { // Share the prefix with all threads detail::uninitialized_copy(&temp_storage.block_prefix, block_prefix); exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 } } CTA_SYNC(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; if (linear_tid > 0) { exclusive_output = scan_op(block_prefix, exclusive_output); } } //--------------------------------------------------------------------- // Inclusive scans //--------------------------------------------------------------------- /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan operator { T block_aggregate; InclusiveScan(input, inclusive_output, scan_op, block_aggregate); } /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); // Apply warp prefix to our lane's partial if (warp_id != 0) { inclusive_output = scan_op(warp_prefix, inclusive_output); } } /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. template < typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
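    //
    // Unlike the exclusive form above, the prefix returned by the callback is
    // folded into every thread's inclusive output (including thread0), so no
    // first-thread special case is needed.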
{ T block_aggregate; InclusiveScan(input, exclusive_output, scan_op, block_aggregate); // Use the first warp to determine the thread block prefix, returning the result in lane0 if (warp_id == 0) { T block_prefix = block_prefix_callback_op(block_aggregate); if (lane_id == 0) { // Share the prefix with all threads detail::uninitialized_copy(&temp_storage.block_prefix, block_prefix); } } CTA_SYNC(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; exclusive_output = scan_op(block_prefix, exclusive_output); } }; CUB_NAMESPACE_END cub-2.0.1/cub/cmake/000077500000000000000000000000001434614775400141335ustar00rootroot00000000000000cub-2.0.1/cub/cmake/cub-config-version.cmake000066400000000000000000000025231434614775400206360ustar00rootroot00000000000000# Parse version information from version.cuh: include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake") file(READ "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh" CUB_VERSION_HEADER) string(REGEX MATCH "#define[ \t]+CUB_VERSION[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}") set(CUB_VERSION_FLAT ${CMAKE_MATCH_1}) # Note that CUB calls this the PATCH number, CMake calls it the TWEAK number: string(REGEX MATCH "#define[ \t]+CUB_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}") set(CUB_VERSION_TWEAK ${CMAKE_MATCH_1}) math(EXPR CUB_VERSION_MAJOR "${CUB_VERSION_FLAT} / 100000") math(EXPR CUB_VERSION_MINOR "(${CUB_VERSION_FLAT} / 100) % 1000") math(EXPR CUB_VERSION_PATCH "${CUB_VERSION_FLAT} % 100") # CUB: "subminor" CMake: "patch" set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}") set(PACKAGE_VERSION ${CUB_VERSION}) set(PACKAGE_VERSION_COMPATIBLE FALSE) set(PACKAGE_VERSION_EXACT FALSE) set(PACKAGE_VERSION_UNSUITABLE FALSE) if(PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION) if(CUB_VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR AND CUB_VERSION_MINOR VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MINOR) set(PACKAGE_VERSION_COMPATIBLE TRUE) endif() if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION) set(PACKAGE_VERSION_EXACT TRUE) endif() endif() cub-2.0.1/cub/cmake/cub-config.cmake000066400000000000000000000111211434614775400171450ustar00rootroot00000000000000# # find_package(CUB) config file. # # Defines a CUB::CUB target that may be linked from user projects to include # CUB. if (TARGET CUB::CUB) return() endif() # Minimum supported libcudacxx version: set(cub_libcudacxx_version 1.8.0) function(_cub_declare_interface_alias alias_name ugly_name) # 1) Only IMPORTED and ALIAS targets can be placed in a namespace. # 2) When an IMPORTED library is linked to another target, its include # directories are treated as SYSTEM includes. # 3) nvcc will automatically check the CUDA Toolkit include path *before* the # system includes. This means that the Toolkit CUB will *always* be used # during compilation, and the include paths of an IMPORTED CUB::CUB # target will never have any effect. # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED # on EVERY target that links to CUB::CUB. This would be a burden and a # footgun for our users. Forgetting this would silently pull in the wrong CUB! # 5) A workaround is to make a non-IMPORTED library outside of the namespace, # configure it, and then ALIAS it into the namespace (or ALIAS and then # configure, that seems to work too). 
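#
# With the alias in place, a consumer only needs the usual pattern
# (illustrative sketch -- the target name `my_app` is hypothetical):
#
#   find_package(CUB REQUIRED)
#   target_link_libraries(my_app PRIVATE CUB::CUB)
#
# and does not have to set NO_SYSTEM_FROM_IMPORTED itself.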
add_library(${ugly_name} INTERFACE) add_library(${alias_name} ALIAS ${ugly_name}) endfunction() # # Setup some internal cache variables # # Pull in the include dir detected by cub-config-version.cmake set(_CUB_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE INTERNAL "Location of CUB headers." FORCE ) unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) set(_CUB_QUIET ON CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) set(_CUB_QUIET_FLAG "QUIET" CACHE INTERNAL "" FORCE) else() set(_CUB_QUIET OFF CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) set(_CUB_QUIET_FLAG "" CACHE INTERNAL "" FORCE) endif() # # Setup dependencies # if (NOT TARGET CUB::libcudacxx) if (TARGET Thrust::libcudacxx) # Prefer the same libcudacxx as Thrust, if available: _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) target_link_libraries(_CUB_libcudacxx INTERFACE Thrust::libcudacxx) else() if (NOT TARGET libcudacxx::libcudacxx) # First do a non-required search for any co-packaged versions. # These are preferred. find_package(libcudacxx ${cub_libcudacxx_version} CONFIG ${_CUB_QUIET_FLAG} NO_DEFAULT_PATH # Only check the explicit HINTS below: HINTS "${_CUB_INCLUDE_DIR}/../libcudacxx" # Source layout "${_CUB_CMAKE_DIR}/.." # Install layout ) # A second required search allows externally packaged to be used and fails if # no suitable package exists. find_package(libcudacxx ${cub_libcudacxx_version} CONFIG REQUIRED ${_CUB_QUIET_FLAG} ) endif() _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) target_link_libraries(_CUB_libcudacxx INTERFACE libcudacxx::libcudacxx) endif() endif() # # Setup targets # _cub_declare_interface_alias(CUB::CUB _CUB_CUB) target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}") target_link_libraries(_CUB_CUB INTERFACE CUB::libcudacxx) if (CUB_IGNORE_DEPRECATED_API OR THRUST_IGNORE_DEPRECATED_API) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_API") endif() if (CUB_IGNORE_DEPRECATED_CPP_DIALECT OR THRUST_IGNORE_DEPRECATED_CPP_DIALECT) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_DIALECT") endif() if (CUB_IGNORE_DEPRECATED_CPP_11 OR THRUST_IGNORE_DEPRECATED_CPP_11) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_11") endif() if (CUB_IGNORE_DEPRECATED_COMPILER OR THRUST_IGNORE_DEPRECATED_COMPILER) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_COMPILER") endif() # # Standardize version info # set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE) set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE) set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE) set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE) set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE) set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE) include(FindPackageHandleStandardArgs) if (NOT CUB_CONFIG) set(CUB_CONFIG "${CMAKE_CURRENT_LIST_FILE}") endif() find_package_handle_standard_args(CUB CONFIG_MODE) cub-2.0.1/cub/cmake/cub-header-search.cmake000066400000000000000000000005521434614775400204010ustar00rootroot00000000000000# Parse version information from version.h in source tree set(_CUB_VERSION_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/../..") if(EXISTS 
"${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh") set(_CUB_VERSION_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE FILEPATH "" FORCE) # Clear old result set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) endif() cub-2.0.1/cub/cmake/cub-header-search.cmake.in000066400000000000000000000014231434614775400210040ustar00rootroot00000000000000# Parse version information from version.h: unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search # Find CMAKE_INSTALL_INCLUDEDIR=@CMAKE_INSTALL_INCLUDEDIR@ directory" set(from_install_prefix "@install_location@") # Transform to a list of directories, replace each directoy with "../" # and convert back to a string string(REGEX REPLACE "/" ";" from_install_prefix "${from_install_prefix}") list(TRANSFORM from_install_prefix REPLACE ".+" "../") list(JOIN from_install_prefix "" from_install_prefix) find_path(_CUB_VERSION_INCLUDE_DIR cub/version.cuh NO_DEFAULT_PATH # Only search explicit paths below: PATHS "${CMAKE_CURRENT_LIST_DIR}/${from_install_prefix}/@CMAKE_INSTALL_INCLUDEDIR@" ) set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) cub-2.0.1/cub/config.cuh000066400000000000000000000037201434614775400150230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Static configuration header for the CUB project. */ #pragma once #include "util_arch.cuh" #include "util_compiler.cuh" #include "util_cpp_dialect.cuh" #include "util_deprecated.cuh" #include "util_macro.cuh" #include "util_namespace.cuh" cub-2.0.1/cub/cub.cuh000066400000000000000000000100031434614775400143170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * CUB umbrella include file */ #pragma once // Static configuration #include "config.cuh" // Block #include "block/block_histogram.cuh" #include "block/block_adjacent_difference.cuh" #include "block/block_discontinuity.cuh" #include "block/block_exchange.cuh" #include "block/block_load.cuh" #include "block/block_radix_rank.cuh" #include "block/block_radix_sort.cuh" #include "block/block_merge_sort.cuh" #include "block/block_reduce.cuh" #include "block/block_scan.cuh" #include "block/block_store.cuh" //#include "block/block_shift.cuh" // Device #include "device/device_merge_sort.cuh" #include "device/device_histogram.cuh" #include "device/device_partition.cuh" #include "device/device_radix_sort.cuh" #include "device/device_reduce.cuh" #include "device/device_run_length_encode.cuh" #include "device/device_scan.cuh" #include "device/device_segmented_sort.cuh" #include "device/device_segmented_radix_sort.cuh" #include "device/device_segmented_reduce.cuh" #include "device/device_select.cuh" #include "device/device_spmv.cuh" #include "device/device_adjacent_difference.cuh" // Grid //#include "grid/grid_barrier.cuh" #include "grid/grid_even_share.cuh" #include "grid/grid_mapping.cuh" #include "grid/grid_queue.cuh" // Thread #include "thread/thread_load.cuh" #include "thread/thread_operators.cuh" #include "thread/thread_reduce.cuh" #include "thread/thread_scan.cuh" #include "thread/thread_store.cuh" // Warp #include "warp/warp_exchange.cuh" #include "warp/warp_load.cuh" #include "warp/warp_merge_sort.cuh" #include "warp/warp_reduce.cuh" #include "warp/warp_scan.cuh" #include "warp/warp_store.cuh" // Iterator #include "iterator/arg_index_input_iterator.cuh" #include "iterator/cache_modified_input_iterator.cuh" #include "iterator/cache_modified_output_iterator.cuh" #include "iterator/constant_input_iterator.cuh" #include "iterator/counting_input_iterator.cuh" #include "iterator/discard_output_iterator.cuh" #include "iterator/tex_obj_input_iterator.cuh" #include 
"iterator/tex_ref_input_iterator.cuh" #include "iterator/transform_input_iterator.cuh" // Util #include "util_allocator.cuh" #include "util_arch.cuh" #include "util_debug.cuh" #include "util_device.cuh" #include "util_macro.cuh" #include "util_ptx.cuh" #include "util_type.cuh" cub-2.0.1/cub/detail/000077500000000000000000000000001434614775400143155ustar00rootroot00000000000000cub-2.0.1/cub/detail/choose_offset.cuh000066400000000000000000000050251434614775400176460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include CUB_NAMESPACE_BEGIN namespace detail { /** * ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and * selects the offset type based on it. */ template struct ChooseOffsetT { // NumItemsT must be an integral type (but not bool). static_assert( std::is_integral::value && !std::is_same::type, bool>::value, "NumItemsT must be an integral type, but not bool"); // Unsigned integer type for global offsets. using Type = typename std::conditional::type; }; } // namespace detail CUB_NAMESPACE_END cub-2.0.1/cub/detail/cpp_compatibility.cuh000066400000000000000000000015521434614775400205340ustar00rootroot00000000000000/* * Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #pragma once #include #if CUB_CPP_DIALECT >= 2017 && __cpp_if_constexpr # define CUB_IF_CONSTEXPR if constexpr # define CUB_ELSE_IF_CONSTEXPR else if constexpr #else # define CUB_IF_CONSTEXPR if # define CUB_ELSE_IF_CONSTEXPR else if #endif cub-2.0.1/cub/detail/detect_cuda_runtime.cuh000066400000000000000000000066241434614775400210350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Utilities for CUDA dynamic parallelism. */ #pragma once #include #include CUB_NAMESPACE_BEGIN namespace detail { #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: /** * \def CUB_RDC_ENABLED * * Defined if RDC is enabled. */ #define CUB_RDC_ENABLED /** * \def CUB_RUNTIME_FUNCTION * * Execution space for functions that can use the CUDA runtime API (`__host__` * when RDC is off, `__host__ __device__` when RDC is on). */ #define CUB_RUNTIME_FUNCTION /** * \def CUB_RUNTIME_ENABLED * * Whether or not the active compiler pass is allowed to invoke device kernels * or methods from the CUDA runtime API. * * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__` * and is not compatible with `NV_IF_TARGET`. It is provided for legacy * purposes only. * * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`. */ #define CUB_RUNTIME_ENABLED #else // Non-doxygen pass: #ifndef CUB_RUNTIME_FUNCTION #if defined(__CUDACC_RDC__) #define CUB_RDC_ENABLED #define CUB_RUNTIME_FUNCTION __host__ __device__ #else // RDC disabled: #define CUB_RUNTIME_FUNCTION __host__ #endif // RDC enabled #if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) // Legacy only -- do not use in new code. 
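// Prefer CUB_RDC_ENABLED combined with NV_IF_TARGET in new code, as described
// in the documentation block above.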
#define CUB_RUNTIME_ENABLED #endif #endif // CUB_RUNTIME_FUNCTION predefined #ifdef CUB_RDC_ENABLED // Detect available version of CDP: #if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED) #define CUB_DETAIL_CDPv1 #else #define CUB_DETAIL_CDPv2 #endif #endif #endif // Do not document } // namespace detail CUB_NAMESPACE_END cub-2.0.1/cub/detail/device_double_buffer.cuh000066400000000000000000000052121434614775400211400ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include CUB_NAMESPACE_BEGIN namespace detail { /** * @brief It's a double-buffer storage wrapper for multi-pass stream * transformations that require more than one storage array for * streaming intermediate results back and forth. * * Many multi-pass computations require a pair of "ping-pong" storage buffers * (e.g., one for reading from and the other for writing to, and then * vice-versa for the subsequent pass). This structure wraps a set of device * buffers. * * Unlike `cub::DoubleBuffer` this class doesn't provide a "selector" member * to track which buffer is "current". The main reason for this class existence * is the performance difference. Since `cub::DoubleBuffer` relies on the * runtime variable to index pointers arrays, they are placed in the local * memory instead of registers. Local memory accesses significantly affect * performance. On the contrary, this class swaps pointer, so all operations * can be performed in registers. */ template class device_double_buffer { /// Pair of device buffer pointers T *m_current_buffer {}; T *m_alternate_buffer {}; public: /** * @param d_current * The currently valid buffer * * @param d_alternate * Alternate storage buffer of the same size as @p d_current */ __host__ __device__ __forceinline__ device_double_buffer(T *current, T *alternate) : m_current_buffer(current) , m_alternate_buffer(alternate) {} /// \brief Return pointer to the currently valid buffer __host__ __device__ __forceinline__ T *current() const { return m_current_buffer; } /// \brief Return pointer to the currently invalid buffer __host__ __device__ __forceinline__ T *alternate() const { return m_alternate_buffer; } __host__ __device__ void swap() { T *tmp = m_current_buffer; m_current_buffer = m_alternate_buffer; m_alternate_buffer = tmp; } }; } // namespace detail CUB_NAMESPACE_END cub-2.0.1/cub/detail/device_synchronize.cuh000066400000000000000000000036061434614775400207150ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { /** * Call `cudaDeviceSynchronize()` using the proper API for the current CUB and * CUDA configuration. */ CUB_EXEC_CHECK_DISABLE CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() { cudaError_t result = cudaErrorNotSupported; // Device-side sync is only available under CDPv1: #if defined(CUB_DETAIL_CDPv1) #if ((__CUDACC_VER_MAJOR__ > 11) || \ ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6))) // CUDA >= 11.6 #define CUB_TMP_DEVICE_SYNC_IMPL \ result = __cudaDeviceSynchronizeDeprecationAvoidance(); #else // CUDA < 11.6: #define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize(); #endif #else // CDPv2 or no CDP: #define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */ #endif // CDP version NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), (CUB_TMP_DEVICE_SYNC_IMPL)); #undef CUB_TMP_DEVICE_SYNC_IMPL return result; } } // namespace detail CUB_NAMESPACE_END cub-2.0.1/cub/detail/exec_check_disable.cuh000066400000000000000000000022351434614775400205640ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include /** * @def CUB_EXEC_CHECK_DISABLE * Wrapper around `#pragma nv_exec_check_disable`. */ // #pragma nv_exec_check_disable is only recognized by NVCC. #if defined(__CUDACC__) && \ !defined(_NVHPC_CUDA) && \ !(defined(__CUDA__) && defined(__clang__)) #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC #define CUB_EXEC_CHECK_DISABLE __pragma("nv_exec_check_disable") #else // // !MSVC #define CUB_EXEC_CHECK_DISABLE _Pragma("nv_exec_check_disable") #endif // MSVC #else // !NVCC #define CUB_EXEC_CHECK_DISABLE #endif // NVCC cub-2.0.1/cub/detail/temporary_storage.cuh000066400000000000000000000206721434614775400205730ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace temporary_storage { class slot; template class alias; template class layout; /** * @brief Temporary storage slot that can be considered a C++ union with an * arbitrary fields count. * * @warning slot lifetime is defined by the lifetime of the associated layout. * It's impossible to request new array if layout is already mapped. 
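 * (In other words, create aliases and grow them before the owning layout is
 * mapped to a device buffer.)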
* * @par A Simple Example * @code * auto slot = temporary_storage.get_slot(0); * * // Add fields into the slot * // Create an int alias with 0 elements: * auto int_array = slot->create_alias(); * // Create a double alias with 2 elements: * auto double_array = slot->create_alias(2); * // Create a char alias with 0 elements: * auto empty_array = slot->create_alias(); * // Slot size is defined by double_array size (2 * sizeof(double)) * * if (condition) * { * int_array.grow(42); * // Now slot size is defined by int_array size (42 * sizeof(int)) * } * * // Temporary storage mapping * // ... * int *d_int_array = int_array.get(); * double *d_double_array = double_array.get(); * char *d_empty_array = empty_array.get(); // Guaranteed to return nullptr * @endcode */ class slot { std::size_t m_size{}; void *m_pointer{}; public: slot() = default; /** * @brief Returns an array of type @p T and length @p elements */ template __host__ __device__ alias create_alias(std::size_t elements = 0); private: __host__ __device__ void set_bytes_required(std::size_t new_size) { m_size = (max)(m_size, new_size); } __host__ __device__ std::size_t get_bytes_required() const { return m_size; } __host__ __device__ void set_storage(void *ptr) { m_pointer = ptr; } __host__ __device__ void *get_storage() const { return m_pointer; } template friend class alias; template friend class layout; }; /** * @brief Named memory region of a temporary storage slot * * @par Overview * This class provides a typed wrapper of a temporary slot memory region. * It can be considered as a field in the C++ union. It's only possible to * increase the array size. * * @warning alias lifetime is defined by the lifetime of the associated slot * It's impossible to grow the array if the layout is already mapped. */ template class alias { slot &m_slot; std::size_t m_elements{}; __host__ __device__ explicit alias(slot &slot, std::size_t elements = 0) : m_slot(slot) , m_elements(elements) { this->update_slot(); } __host__ __device__ void update_slot() { m_slot.set_bytes_required(m_elements * sizeof(T)); } public: alias() = delete; /** * @brief Increases the number of elements * * @warning * This method should be called before temporary storage mapping stage. * * @param[in] new_elements Increases the memory region occupied in the * temporary slot to fit up to @p new_elements items * of type @p T. */ __host__ __device__ void grow(std::size_t new_elements) { m_elements = new_elements; this->update_slot(); } /** * @brief Returns pointer to array * * If the @p elements number is equal to zero, or storage layout isn't mapped, * @p nullptr is returned. */ __host__ __device__ T *get() const { if (m_elements == 0) { return nullptr; } return reinterpret_cast(m_slot.get_storage()); } friend class slot; }; template __host__ __device__ alias slot::create_alias(std::size_t elements) { return alias(*this, elements); } /** * @brief Temporary storage layout represents a structure with * @p SlotsCount union-like fields * * The layout can be mapped to a temporary buffer only once. 
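 * (A second call to @p map_to_buffer returns @p cudaErrorAlreadyMapped.)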
* * @par A Simple Example * @code * cub::detail::temporary_storage::layout<3> temporary_storage; * * auto slot_1 = temporary_storage.get_slot(0); * auto slot_2 = temporary_storage.get_slot(1); * * // Add fields into the first slot * auto int_array = slot_1->create_alias(1); * auto double_array = slot_1->create_alias(2); * * // Add fields into the second slot * auto char_array = slot_2->create_alias(); * * // The equivalent C++ structure could look like * // struct StorageLayout * // { * // union { * // } slot_0; * // std::byte padding_0[256 - sizeof (slot_0)]; * // * // union { * // int alias_0[1]; * // double alias_1[2]; * // } slot_1; * // std::byte padding_1[256 - sizeof (slot_1)]; * // * // union { * // char alias_0[0]; * // } slot_2; * // std::byte padding_2[256 - sizeof (slot_2)]; * // }; * * // The third slot is empty * * // Temporary storage mapping * if (d_temp_storage == nullptr) * { * temp_storage_bytes = temporary_storage.get_size(); * return; * } * else * { * temporary_storage.map_to_buffer(d_temp_storage, temp_storage_bytes); * } * * // Use pointers * int *d_int_array = int_array.get(); * double *d_double_array = double_array.get(); * char *d_char_array = char_array.get(); * @endcode */ template class layout { slot m_slots[SlotsCount]; std::size_t m_sizes[SlotsCount]; void *m_pointers[SlotsCount]; bool m_layout_was_mapped {}; public: layout() = default; __host__ __device__ slot *get_slot(int slot_id) { if (slot_id < SlotsCount) { return &m_slots[slot_id]; } return nullptr; } /** * @brief Returns required temporary storage size in bytes */ __host__ __device__ std::size_t get_size() { this->prepare_interface(); // AliasTemporaries can return error only in mapping stage, // so it's safe to ignore it here. std::size_t temp_storage_bytes{}; AliasTemporaries(nullptr, temp_storage_bytes, m_pointers, m_sizes); if (temp_storage_bytes == 0) { // The current CUB convention implies that there are two stages for each // device-scope function call. The first one returns the required storage // size. The second stage consumes temporary storage to perform some work. // The only way to distinguish between the two stages is by checking the // value of the temporary storage pointer. If zero bytes are requested, // `cudaMalloc` will return `nullptr`. This fact makes it impossible to // distinguish between the two stages, so we request some fixed amount of // bytes (even if we don't need it) to have a non-null temporary storage // pointer. return 1; } return temp_storage_bytes; } /** * @brief Maps the layout to the temporary storage buffer. 
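   *
   * @param[in] d_temp_storage Device-accessible buffer that backs the layout
   * @param[in] temp_storage_bytes Size of the @p d_temp_storage allocation in bytes
   *
   * @return @p cudaErrorAlreadyMapped if the layout has already been mapped;
   *         otherwise, the status of the underlying @p AliasTemporaries call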
*/ __host__ __device__ cudaError_t map_to_buffer(void *d_temp_storage, std::size_t temp_storage_bytes) { if (m_layout_was_mapped) { return cudaErrorAlreadyMapped; } this->prepare_interface(); cudaError_t error = cudaSuccess; if ((error = AliasTemporaries(d_temp_storage, temp_storage_bytes, m_pointers, m_sizes))) { return error; } for (std::size_t slot_id = 0; slot_id < SlotsCount; slot_id++) { m_slots[slot_id].set_storage(m_pointers[slot_id]); } m_layout_was_mapped = true; return error; } private: __host__ __device__ void prepare_interface() { if (m_layout_was_mapped) { return; } for (std::size_t slot_id = 0; slot_id < SlotsCount; slot_id++) { const std::size_t slot_size = m_slots[slot_id].get_bytes_required(); m_sizes[slot_id] = slot_size; m_pointers[slot_id] = nullptr; } } }; } // namespace temporary_storage } // namespace detail CUB_NAMESPACE_END cub-2.0.1/cub/detail/type_traits.cuh000066400000000000000000000046651434614775400174000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Wrappers and extensions around utilities. */ #pragma once #include #include #include CUB_NAMESPACE_BEGIN namespace detail { template using invoke_result_t = #if CUB_CPP_DIALECT < 2017 typename ::cuda::std::result_of::type; #else // 2017+ ::cuda::std::invoke_result_t; #endif /// The type of intermediate accumulator (according to P2322R6) template using accumulator_t = typename ::cuda::std::decay>::type; } // namespace detail CUB_NAMESPACE_END cub-2.0.1/cub/detail/uninitialized_copy.cuh000066400000000000000000000046741434614775400207330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include CUB_NAMESPACE_BEGIN namespace detail { template ::value, int >::type = 0> __host__ __device__ void uninitialized_copy(T *ptr, U &&val) { *ptr = ::cuda::std::forward(val); } template ::value, int >::type = 0> __host__ __device__ void uninitialized_copy(T *ptr, U &&val) { new (ptr) T(::cuda::std::forward(val)); } } // namespace detail CUB_NAMESPACE_END cub-2.0.1/cub/device/000077500000000000000000000000001434614775400143125ustar00rootroot00000000000000cub-2.0.1/cub/device/device_adjacent_difference.cuh000066400000000000000000000631431434614775400222640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceAdjacentDifference provides device-wide, parallel operations for * computing the differences of adjacent elements residing within * device-accessible memory. * * @ingroup SingleModule * * @par Overview * - DeviceAdjacentDifference calculates the differences of adjacent elements in * d_input. Because the binary operation could be noncommutative, there * are two sets of methods. Methods named SubtractLeft subtract left element * `*(i - 1)` of input sequence from current element `*i`. * Methods named `SubtractRight` subtract current element `*i` from the * right one `*(i + 1)`: * @par * @code * int *d_values; // [1, 2, 3, 4] * //... * int *d_subtract_left_result <-- [ 1, 1, 1, 1 ] * int *d_subtract_right_result <-- [ -1, -1, -1, 4 ] * @endcode * - For SubtractLeft, if the left element is out of bounds, the iterator is * assigned to \*(result + (i - first)) without modification. * - For SubtractRight, if the right element is out of bounds, the iterator is * assigned to \*(result + (i - first)) without modification. * * @par Snippet * The code snippet below illustrates how to use @p DeviceAdjacentDifference to * compute the left difference between adjacent elements. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * int num_items; // e.g., 8 * int *d_values; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] * //... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * * cub::DeviceAdjacentDifference::SubtractLeft( * d_temp_storage, temp_storage_bytes, d_values, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run operation * cub::DeviceAdjacentDifference::SubtractLeft( * d_temp_storage, temp_storage_bytes, d_values, num_items); * * // d_values <-- [1, 1, -1, 1, -1, 1, -1, 1] * @endcode */ struct DeviceAdjacentDifference { private: template static CUB_RUNTIME_FUNCTION cudaError_t AdjacentDifference(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream) { using OffsetT = typename detail::ChooseOffsetT::Type; using DispatchT = DispatchAdjacentDifference; return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, static_cast(num_items), difference_op, stream); } public: /** * @brief Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. * @ingroup SingleModule * * @par Overview * - Calculates the differences of adjacent elements in `d_input`. 
That is, * `*d_input` is assigned to `*d_output`, and, for each iterator `i` in the * range `[d_input + 1, d_input + num_items)`, the result of * `difference_op(*i, *(i - 1))` is assigned to * `*(d_output + (i - d_input))`. * - Note that the behavior is undefined if the input and output ranges * overlap in any way. * * @par Snippet * The code snippet below illustrates how to use @p DeviceAdjacentDifference * to compute the difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * // Declare, allocate, and initialize device-accessible pointers * int num_items; // e.g., 8 * int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] * int *d_output; * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * * cub::DeviceAdjacentDifference::SubtractLeftCopy( * d_temp_storage, temp_storage_bytes, * d_input, d_output, * num_items, CustomDifference()); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run operation * cub::DeviceAdjacentDifference::SubtractLeftCopy( * d_temp_storage, temp_storage_bytes, * d_input, d_output, * num_items, CustomDifference()); * * // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] * // d_output <-- [1, 1, -1, 1, -1, 1, -1, 1] * @endcode * * @tparam InputIteratorT * is a model of Input Iterator, * and `x` and `y` are objects of `InputIteratorT`'s `value_type`, then * `x - y` is defined, and `InputIteratorT`'s `value_type` is convertible to * a type in `OutputIteratorT`'s set of `value_types`, and the return type * of `x - y` is convertible to a type in `OutputIteratorT`'s set of * `value_types`. * * @tparam OutputIteratorT * is a model of Output Iterator. * * @tparam DifferenceOpT * Its `result_type` is convertible to a type in `OutputIteratorT`'s set of * `value_types`. * * @tparam NumItemsT **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_input * Pointer to the input sequence * * @param[out] d_output * Pointer to the output sequence * * @param[in] num_items * Number of items in the input sequence * * @param[in] difference_op * The binary function used to compute differences * * @param[in] stream * [optional] CUDA stream to launch kernels within. 
Default is * stream0 */ template static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { constexpr bool may_alias = false; constexpr bool read_left = true; return AdjacentDifference(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SubtractLeftCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } /** * @brief Subtracts the left element of each adjacent pair of elements * residing within device-accessible memory. * * @ingroup SingleModule * * @par Overview * Calculates the differences of adjacent elements in `d_input`. That is, for * each iterator `i` in the range `[d_input + 1, d_input + num_items)`, the * result of `difference_op(*i, *(i - 1))` is assigned to * `*(d_input + (i - d_input))`. * * @par Snippet * The code snippet below illustrates how to use @p DeviceAdjacentDifference * to compute the difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * // Declare, allocate, and initialize device-accessible pointers * int num_items; // e.g., 8 * int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceAdjacentDifference::SubtractLeft( * d_temp_storage, temp_storage_bytes, * d_data, num_items, CustomDifference()); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run operation * cub::DeviceAdjacentDifference::SubtractLeft( * d_temp_storage, temp_storage_bytes, * d_data, num_items, CustomDifference()); * * // d_data <-- [1, 1, -1, 1, -1, 1, -1, 1] * @endcode * * @tparam RandomAccessIteratorT * is a model of Random Access Iterator, * `RandomAccessIteratorT` is mutable. If `x` and `y` are objects of * `RandomAccessIteratorT`'s `value_type`, and `x - y` is defined, then the * return type of `x - y` should be convertible to a type in * `RandomAccessIteratorT`'s set of `value_types`. * * @tparam DifferenceOpT * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s * set of `value_types`. * * @tparam NumItemsT **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in,out] d_input * Pointer to the input sequence and the result * * @param[in] num_items * Number of items in the input sequence * * @param[in] difference_op * The binary function used to compute differences * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
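   *
   * @return The @p cudaError_t status propagated from the underlying dispatch
   *         (@p cudaSuccess on success). The differences are written in place
   *         over @p d_input.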
*/ template static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft(void *d_temp_storage, std::size_t &temp_storage_bytes, RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { constexpr bool may_alias = true; constexpr bool read_left = true; return AdjacentDifference(d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft(void *d_temp_storage, std::size_t &temp_storage_bytes, RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SubtractLeft(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } /** * @brief Subtracts the right element of each adjacent pair of elements * residing within device-accessible memory. * * @ingroup SingleModule * * @par Overview * - Calculates the right differences of adjacent elements in `d_input`. That * is, `*(d_input + num_items - 1)` is assigned to * `*(d_output + num_items - 1)`, and, for each iterator `i` in the range * `[d_input, d_input + num_items - 1)`, the result of * `difference_op(*i, *(i + 1))` is assigned to * `*(d_output + (i - d_input))`. * - Note that the behavior is undefined if the input and output ranges * overlap in any way. * * @par Snippet * The code snippet below illustrates how to use @p DeviceAdjacentDifference * to compute the difference between adjacent elements. * * @par * @code * #include * // or equivalently * * struct CustomDifference * { * template * __device__ DataType operator()(DataType &lhs, DataType &rhs) * { * return lhs - rhs; * } * }; * * // Declare, allocate, and initialize device-accessible pointers * int num_items; // e.g., 8 * int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] * int *d_output; * .. * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceAdjacentDifference::SubtractRightCopy( * d_temp_storage, temp_storage_bytes, * d_input, d_output, num_items, CustomDifference()); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run operation * cub::DeviceAdjacentDifference::SubtractRightCopy( * d_temp_storage, temp_storage_bytes, * d_input, d_output, num_items, CustomDifference()); * * // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] * // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] * @endcode * * @tparam InputIteratorT * is a model of Input Iterator, * and `x` and `y` are objects of `InputIteratorT`'s `value_type`, then * `x - y` is defined, and `InputIteratorT`'s `value_type` is convertible to * a type in `OutputIteratorT`'s set of `value_types`, and the return type * of `x - y` is convertible to a type in `OutputIteratorT`'s set of * `value_types`. * * @tparam OutputIteratorT * is a model of Output Iterator. * * @tparam DifferenceOpT * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s * set of `value_types`. * * @tparam NumItemsT **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_input * Pointer to the input sequence * * @param[out] d_output * Pointer to the output sequence * * @param[in] num_items * Number of items in the input sequence * * @param[in] difference_op * The binary function used to compute differences. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. */ template static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { constexpr bool may_alias = false; constexpr bool read_left = false; return AdjacentDifference(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SubtractRightCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } /** * @brief Subtracts the right element of each adjacent pair of elements * residing within device-accessible memory. * * @ingroup SingleModule * * @par Overview * Calculates the right differences of adjacent elements in `d_input`. That * is, for each iterator `i` in the range * `[d_input, d_input + num_items - 1)`, the result of * `difference_op(*i, *(i + 1))` is assigned to * `*(d_input + (i - d_input))`. * * @par Snippet * The code snippet below illustrates how to use @p DeviceAdjacentDifference * to compute the difference between adjacent elements. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * int num_items; // e.g., 8 * int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceAdjacentDifference::SubtractRight( * d_temp_storage, temp_storage_bytes, d_data, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run operation * cub::DeviceAdjacentDifference::SubtractRight( * d_temp_storage, temp_storage_bytes, d_data, num_items); * * // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] * @endcode * * @tparam RandomAccessIteratorT * is a model of Random Access Iterator, * `RandomAccessIteratorT` is mutable. If `x` and `y` are objects of * `RandomAccessIteratorT`'s `value_type`, and `x - y` is defined, then the * return type of `x - y` should be convertible to a type in * `RandomAccessIteratorT`'s set of `value_types`. * * @tparam DifferenceOpT * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s * set of `value_types`. * * @tparam NumItemsT **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_input * Pointer to the input sequence * * @param[in] num_items * Number of items in the input sequence * * @param[in] difference_op * The binary function used to compute differences * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. */ template static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight(void *d_temp_storage, std::size_t &temp_storage_bytes, RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { constexpr bool may_alias = true; constexpr bool read_left = false; return AdjacentDifference(d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight(void *d_temp_storage, std::size_t &temp_storage_bytes, RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SubtractRight(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/device_histogram.cuh000066400000000000000000002053261434614775400203370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceHistogram provides device-wide parallel operations for * constructing histogram(s) from a sequence of samples data residing * within device-accessible memory. 
*/ #pragma once #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceHistogram provides device-wide parallel operations for * constructing histogram(s) from a sequence of samples data residing * within device-accessible memory. ![](histogram_logo.png) * @ingroup SingleModule * * @par Overview * A histogram * counts the number of observations that fall into each of the disjoint categories (known as bins). * * @par Usage Considerations * @cdp_class{DeviceHistogram} * */ struct DeviceHistogram { /******************************************************************//** * @name Evenly-segmented bin ranges *********************************************************************/ //@{ /** * @brief Computes an intensity histogram from a sequence of data samples * using equal-width bins. * * @par * - The number of histogram bins is (`num_levels - 1`) * - All bins comprise the same width of sample values: * `(upper_level - lower_level) / (num_levels - 1)` * - The ranges `[d_samples, d_samples + num_samples)` and * `[d_histogram, d_histogram + num_levels - 1)` shall not overlap * in any way. * - `cuda::std::common_type` must be valid, and both LevelT * and SampleT must be valid arithmetic types. The common type must be * convertible to `int` and trivially copyable. * - @devicestorage * * @par Snippet * The code snippet below illustrates the computation of a six-bin histogram * from a sequence of float samples * * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input samples and output histogram * int num_samples; // e.g., 10 * float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5] * int* d_histogram; // e.g., [ -, -, -, -, -, -] * int num_levels; // e.g., 7 (seven level boundaries for six bins) * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::HistogramEven( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, * lower_level, upper_level, num_samples); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::HistogramEven( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, * lower_level, upper_level, num_samples); * * // d_histogram <-- [1, 5, 0, 3, 0, 0]; * @endcode * * @tparam SampleIteratorT * **[inferred]** Random-access input iterator type for reading input * samples \iterator * * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * * @tparam OffsetT * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_samples * The pointer to the input sequence of data samples. * * @param[out] d_histogram * The pointer to the histogram counter output array of length * `num_levels - 1`. 
* * @param[in] num_levels * The number of boundaries (levels) for delineating histogram samples. * Implies that the number of bins is `num_levels - 1`. * * @param[in] lower_level * The lower sample value bound (inclusive) for the lowest histogram bin. * * @param[in] upper_level * The upper sample value bound (exclusive) for the highest histogram bin. * * @param[in] num_samples * The number of input samples (i.e., the length of `d_samples`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram, int num_levels, LevelT lower_level, LevelT upper_level, OffsetT num_samples, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; CounterT *d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; LevelT lower_level1[1] = {lower_level}; LevelT upper_level1[1] = {upper_level}; return MultiHistogramEven<1, 1>(d_temp_storage, temp_storage_bytes, d_samples, d_histogram1, num_levels1, lower_level1, upper_level1, num_samples, static_cast(1), sizeof(SampleT) * num_samples, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram, int num_levels, LevelT lower_level, LevelT upper_level, OffsetT num_samples, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples, stream); } /** * @brief Computes an intensity histogram from a sequence of data samples * using equal-width bins. * * @par * - A two-dimensional *region of interest* within `d_samples` can be * specified using the `num_row_samples`, `num_rows`, and * `row_stride_bytes` parameters. * - The row stride must be a whole multiple of the sample data type * size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`. * - The number of histogram bins is (`num_levels - 1`) * - All bins comprise the same width of sample values: * `(upper_level - lower_level) / (num_levels - 1)` * - For a given row `r` in `[0, num_rows)`, let * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and * `row_end = row_begin + num_row_samples`. The ranges * `[row_begin, row_end)` and `[d_histogram, d_histogram + num_levels - 1)` * shall not overlap in any way. * - `cuda::std::common_type` must be valid, and both LevelT * and SampleT must be valid arithmetic types. The common type must be * convertible to `int` and trivially copyable. * - @devicestorage * * @par Snippet * The code snippet below illustrates the computation of a six-bin histogram * from a 2x5 region of interest within a flattened 2x7 array of float samples. 
* * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input samples and output histogram * int num_row_samples; // e.g., 5 * int num_rows; // e.g., 2; * size_t row_stride_bytes; // e.g., 7 * sizeof(float) * float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, -, -, * // 0.3, 2.9, 2.1, 6.1, 999.5, -, -] * int* d_histogram; // e.g., [ -, -, -, -, -, -] * int num_levels; // e.g., 7 (seven level boundaries for six bins) * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::HistogramEven( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, * num_row_samples, num_rows, row_stride_bytes); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::HistogramEven( * d_temp_storage, temp_storage_bytes, d_samples, d_histogram, * d_samples, d_histogram, num_levels, lower_level, upper_level, * num_row_samples, num_rows, row_stride_bytes); * * // d_histogram <-- [1, 5, 0, 3, 0, 0]; * @endcode * * @tparam SampleIteratorT * **[inferred]** Random-access input iterator type for reading * input samples. \iterator * * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * * @tparam OffsetT * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_samples * The pointer to the input sequence of data samples. * * @param[out] d_histogram * The pointer to the histogram counter output array of * length `num_levels - 1`. * * @param[in] num_levels * The number of boundaries (levels) for delineating histogram samples. * Implies that the number of bins is `num_levels - 1`. * * @param[in] lower_level * The lower sample value bound (inclusive) for the lowest histogram bin. * * @param[in] upper_level * The upper sample value bound (exclusive) for the highest histogram bin. * * @param[in] num_row_samples * The number of data samples per row in the region of interest * * @param[in] num_rows * The number of rows in the region of interest * * @param[in] row_stride_bytes * The number of bytes between starts of consecutive rows in * the region of interest * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
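 *
 * @par
 * As an illustrative sketch only (not part of this interface), the
 * equal-width binning described above maps an in-range sample to a bin
 * index as follows; `sample` is a hypothetical value, while `lower_level`,
 * `upper_level`, and `num_levels` are the host-side arguments shown in the
 * snippet above:
 *
 * @code
 * // Width shared by every bin
 * float bin_width = (upper_level - lower_level) / (num_levels - 1);
 *
 * // Bin receiving an in-range sample (0 <= bin < num_levels - 1)
 * int bin = static_cast<int>((sample - lower_level) / bin_width);
 *
 * // e.g., with lower_level = 0.0f, upper_level = 12.0f, num_levels = 7:
 * // bin_width == 2.0f, and a sample of 6.1f falls in bin 3 ([6, 8))
 * @endcode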
*/ template CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram, int num_levels, LevelT lower_level, LevelT upper_level, OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream = 0) { CounterT *d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; LevelT lower_level1[1] = {lower_level}; LevelT upper_level1[1] = {upper_level}; return MultiHistogramEven<1, 1>(d_temp_storage, temp_storage_bytes, d_samples, d_histogram1, num_levels1, lower_level1, upper_level1, num_row_samples, num_rows, row_stride_bytes, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram, int num_levels, LevelT lower_level, LevelT upper_level, OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_samples, num_rows, row_stride_bytes, stream); } /** * @brief Computes per-channel intensity histograms from a sequence of * multi-channel "pixel" data samples using equal-width bins. * * @par * - The input is a sequence of *pixel* structures, where each pixel comprises * a record of `NUM_CHANNELS` consecutive data samples * (e.g., an *RGBA* pixel). * - Of the `NUM_CHANNELS` specified, the function will only compute * histograms for the first `NUM_ACTIVE_CHANNELS` * (e.g., only *RGB* histograms from *RGBA* pixel samples). * - The number of histogram bins for channeli is * `num_levels[i] - 1`. * - For channeli, the range of values for all histogram bins * have the same width: * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` * - For a given channel `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges * `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` and * `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap * in any way. * - `cuda::std::common_type` must be valid, and both LevelT * and SampleT must be valid arithmetic types. The common type must be * convertible to `int` and trivially copyable. * - @devicestorage * * @par Snippet * The code snippet below illustrates the computation of three 256-bin RGB histograms * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) * * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input samples and output histograms * int num_pixels; // e.g., 5 * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), * // (0, 6, 7, 5), (3, 0, 2, 6)] * int* d_histogram[3]; // e.g., three device pointers to three device buffers, * // each allocated with 256 integer counters * int num_levels[3]; // e.g., {257, 257, 257}; * unsigned int lower_level[3]; // e.g., {0, 0, 0}; * unsigned int upper_level[3]; // e.g., {256, 256, 256}; * ... 
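 *
 * // (Illustrative assumption, not shown above: each entry of d_histogram
 * //  could be a separate device allocation gathered into a host-side array
 * //  of device pointers, e.g.
 * //    for (int c = 0; c < 3; ++c)
 * //      cudaMalloc(&d_histogram[c], 256 * sizeof(int));
 * //  matching the 256 counters per channel implied by num_levels = 257.)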
* * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramEven<4, 3>( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, * lower_level, upper_level, num_pixels); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::MultiHistogramEven<4, 3>( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, * lower_level, upper_level, num_pixels); * * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] * @endcode * * @tparam NUM_CHANNELS * Number of channels interleaved in the input data (may be greater than * the number of channels being actively histogrammed) * * @tparam NUM_ACTIVE_CHANNELS * **[inferred]** Number of channels actively being histogrammed * * @tparam SampleIteratorT * **[inferred]** Random-access input iterator type for reading * input samples. \iterator * * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * * @tparam OffsetT * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_samples * The pointer to the multi-channel input sequence of data samples. * The samples from different channels are assumed to be interleaved * (e.g., an array of 32-bit pixels where each pixel consists of four * *RGBA* 8-bit samples). * * @param[out] d_histogram * The pointers to the histogram counter output arrays, one for each active * channel. For channeli, the allocation length of * `d_histogram[i]` should be `num_levels[i] - 1`. * * @param[in] num_levels * The number of boundaries (levels) for delineating histogram samples in * each active channel. Implies that the number of bins for * channeli is `num_levels[i] - 1`. * * @param[in] lower_level * The lower sample value bound (inclusive) for the lowest histogram bin in * each active channel. * * @param[in] upper_level * The upper sample value bound (exclusive) for the highest histogram bin * in each active channel. * * @param[in] num_pixels * The number of multi-channel pixels * (i.e., the length of `d_samples / NUM_CHANNELS`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
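 *
 * @par
 * A minimal sketch of the interleaved-sample convention assumed above
 * (illustrative only; `p` and `c` are hypothetical pixel and channel
 * indices):
 *
 * @code
 * // Sample for channel c of pixel p in an interleaved NUM_CHANNELS layout.
 * // For example, c = 0..2 selects the R, G, B samples of an RGBA pixel when
 * // NUM_CHANNELS == 4 and NUM_ACTIVE_CHANNELS == 3.
 * SampleT sample = d_samples[p * NUM_CHANNELS + c];
 * @endcode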
*/ template CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram[NUM_ACTIVE_CHANNELS], int num_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_pixels, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; return MultiHistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels, static_cast(1), sizeof(SampleT) * NUM_CHANNELS * num_pixels, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram[NUM_ACTIVE_CHANNELS], int num_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_pixels, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return MultiHistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels, stream); } /** * @brief Computes per-channel intensity histograms from a sequence of * multi-channel "pixel" data samples using equal-width bins. * * @par * - The input is a sequence of *pixel* structures, where each pixel * comprises a record of `NUM_CHANNELS` consecutive data samples * (e.g., an *RGBA* pixel). * - Of the `NUM_CHANNELS` specified, the function will only compute * histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., only *RGB* * histograms from *RGBA* pixel samples). * - A two-dimensional *region of interest* within `d_samples` can be * specified using the `num_row_samples`, `num_rows`, and * `row_stride_bytes` parameters. * - The row stride must be a whole multiple of the sample data type * size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`. * - The number of histogram bins for channeli is * `num_levels[i] - 1`. * - For channeli, the range of values for all histogram * bins have the same width: * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` * - For a given row `r` in `[0, num_rows)`, and sample `s` in * `[0, num_row_pixels)`, let * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, * `sample_begin = row_begin + s * NUM_CHANNELS`, and * `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For a given channel * `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges * `[sample_begin, sample_end)` and * `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap * in any way. * - `cuda::std::common_type` must be valid, and both LevelT * and SampleT must be valid arithmetic types. The common type must be * convertible to `int` and trivially copyable. * - @devicestorage * * @par Snippet * The code snippet below illustrates the computation of three 256-bin * *RGB* histograms from a 2x3 region of interest of within a flattened 2x4 * array of quad-channel *RGBA* pixels (8 bits per channel per pixel). 
* * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input * // samples and output histograms * int num_row_pixels; // e.g., 3 * int num_rows; // e.g., 2 * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] * int* d_histogram[3]; // e.g., three device pointers to three device buffers, * // each allocated with 256 integer counters * int num_levels[3]; // e.g., {257, 257, 257}; * unsigned int lower_level[3]; // e.g., {0, 0, 0}; * unsigned int upper_level[3]; // e.g., {256, 256, 256}; * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramEven<4, 3>( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, * num_row_pixels, num_rows, row_stride_bytes); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::MultiHistogramEven<4, 3>( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, * num_row_pixels, num_rows, row_stride_bytes); * * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] * @endcode * * @tparam NUM_CHANNELS * Number of channels interleaved in the input data (may be greater than * the number of channels being actively histogrammed) * * @tparam NUM_ACTIVE_CHANNELS * **[inferred]** Number of channels actively being histogrammed * * @tparam SampleIteratorT * **[inferred]** Random-access input iterator type for reading input * samples. \iterator * * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * * @tparam OffsetT * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_samples * The pointer to the multi-channel input sequence of data samples. The * samples from different channels are assumed to be interleaved (e.g., * an array of 32-bit pixels where each pixel consists of four * *RGBA* 8-bit samples). * * @param[out] d_histogram * The pointers to the histogram counter output arrays, one for each * active channel. For channeli, the allocation length * of `d_histogram[i]` should be `num_levels[i] - 1`. * * @param[in] num_levels * The number of boundaries (levels) for delineating histogram samples in * each active channel. Implies that the number of bins for * channeli is `num_levels[i] - 1`. * * @param[in] lower_level * The lower sample value bound (inclusive) for the lowest histogram bin in * each active channel. * * @param[in] upper_level * The upper sample value bound (exclusive) for the highest histogram bin * in each active channel. 
* * @param[in] num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param[in] num_rows * The number of rows in the region of interest * * @param[in] row_stride_bytes * The number of bytes between starts of consecutive rows in the region of * interest * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram[NUM_ACTIVE_CHANNELS], int num_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; Int2Type is_byte_sample; if ((sizeof(OffsetT) > sizeof(int)) && ((unsigned long long)(num_rows * row_stride_bytes) < (unsigned long long)INT_MAX)) { // Down-convert OffsetT data type return DispatchHistogram::DispatchEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, (int)num_row_pixels, (int)num_rows, (int)(row_stride_bytes / sizeof(SampleT)), stream, is_byte_sample); } return DispatchHistogram::DispatchEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, (OffsetT)(row_stride_bytes / sizeof(SampleT)), stream, is_byte_sample); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram[NUM_ACTIVE_CHANNELS], int num_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return MultiHistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes, stream); } //@} end member group /******************************************************************//** * @name Custom bin ranges *********************************************************************/ //@{ /** * @brief Computes an intensity histogram from a sequence of data samples * using the specified bin boundary levels. * * @par * - The number of histogram bins is (`num_levels - 1`) * - The value range for bini is `[level[i], level[i+1])` * - The range `[d_histogram, d_histogram + num_levels - 1)` shall not * overlap `[d_samples, d_samples + num_samples)` nor * `[d_levels, d_levels + num_levels)` in any way. The ranges * `[d_levels, d_levels + num_levels)` and * `[d_samples, d_samples + num_samples)` may overlap. * - @devicestorage * * @par Snippet * The code snippet below illustrates the computation of an six-bin histogram * from a sequence of float samples * * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input * // samples and output histogram * int num_samples; // e.g., 10 * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] * int* d_histogram; // e.g., [ -, -, -, -, -, -] * int num_levels // e.g., 7 (seven level boundaries for six bins) * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] * ... 
* * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::HistogramRange( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_samples); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::HistogramRange( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_samples); * * // d_histogram <-- [1, 5, 0, 3, 0, 0]; * * @endcode * * @tparam SampleIteratorT * **[inferred]** Random-access input iterator type for reading * input samples.\iterator * * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * * @tparam OffsetT * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_samples * The pointer to the input sequence of data samples. * * @param[out] d_histogram * The pointer to the histogram counter output array of length * `num_levels - 1`. * * @param[in] num_levels * The number of boundaries (levels) for delineating histogram samples. * Implies that the number of bins is `num_levels - 1`. * * @param[in] d_levels * The pointer to the array of boundaries (levels). Bin ranges are defined * by consecutive boundary pairings: lower sample value boundaries are * inclusive and upper sample value boundaries are exclusive. * * @param[in] num_samples * The number of data samples per row in the region of interest * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram, int num_levels, LevelT *d_levels, OffsetT num_samples, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; CounterT *d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; LevelT *d_levels1[1] = {d_levels}; return MultiHistogramRange<1, 1>(d_temp_storage, temp_storage_bytes, d_samples, d_histogram1, num_levels1, d_levels1, num_samples, (OffsetT)1, (size_t)(sizeof(SampleT) * num_samples), stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram, int num_levels, LevelT *d_levels, OffsetT num_samples, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return HistogramRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples, stream); } /** * @brief Computes an intensity histogram from a sequence of data samples * using the specified bin boundary levels. * * @par * - A two-dimensional *region of interest* within `d_samples` can be * specified using the `num_row_samples`, `num_rows`, and * `row_stride_bytes` parameters. * - The row stride must be a whole multiple of the sample data type * size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`. 
* - The number of histogram bins is (`num_levels - 1`) * - The value range for bini is `[level[i], level[i+1])` * - For a given row `r` in `[0, num_rows)`, let * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and * `row_end = row_begin + num_row_samples`. The range * `[d_histogram, d_histogram + num_levels - 1)` shall not overlap * `[row_begin, row_end)` nor `[d_levels, d_levels + num_levels)`. * The ranges `[d_levels, d_levels + num_levels)` and `[row_begin, row_end)` * may overlap. * - @devicestorage * * @par Snippet * The code snippet below illustrates the computation of a six-bin histogram * from a 2x5 region of interest within a flattened 2x7 array of float samples. * * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input samples and * // output histogram * int num_row_samples; // e.g., 5 * int num_rows; // e.g., 2; * int row_stride_bytes; // e.g., 7 * sizeof(float) * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] * int* d_histogram; // e.g., [ -, -, -, -, -, -] * int num_levels // e.g., 7 (seven level boundaries for six bins) * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::HistogramRange( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, * num_row_samples, num_rows, row_stride_bytes); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::HistogramRange( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, * num_row_samples, num_rows, row_stride_bytes); * * // d_histogram <-- [1, 5, 0, 3, 0, 0]; * @endcode * * @tparam SampleIteratorT * **[inferred]** Random-access input iterator type for reading * input samples. \iterator * * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * * @tparam OffsetT * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_samples * The pointer to the input sequence of data samples. * * @param[out] d_histogram * The pointer to the histogram counter output array of length * `num_levels - 1`. * * @param[in] num_levels * The number of boundaries (levels) for delineating histogram samples. * Implies that the number of bins is `num_levels - 1`. * * @param[in] d_levels * The pointer to the array of boundaries (levels). Bin ranges are defined * by consecutive boundary pairings: lower sample value boundaries are * inclusive and upper sample value boundaries are exclusive. * * @param[in] num_row_samples * The number of data samples per row in the region of interest * * @param[in] num_rows * The number of rows in the region of interest * * @param[in] row_stride_bytes * The number of bytes between starts of consecutive rows in the region * of interest * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
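 *
 * @par
 * The bin selection implied by the boundary levels can be pictured with the
 * following host-side sketch (illustrative only; this is not how the device
 * implementation is written, `sample` is a hypothetical in-range value, and
 * the sketch assumes the `num_levels` ascending boundaries are available on
 * the host):
 *
 * @code
 * #include <algorithm>
 *
 * // Bin receiving `sample`, given that bin i covers
 * // [d_levels[i], d_levels[i + 1])
 * int bin = static_cast<int>(
 *   std::upper_bound(d_levels, d_levels + num_levels, sample) - d_levels) - 1;
 *
 * // e.g., with levels [0, 2, 4, 6, 8, 12, 16], a sample of 2.9f falls in
 * // bin 1 ([2, 4))
 * @endcode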
*/ template CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram, int num_levels, LevelT *d_levels, OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream = 0) { CounterT *d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; LevelT *d_levels1[1] = {d_levels}; return MultiHistogramRange<1, 1>(d_temp_storage, temp_storage_bytes, d_samples, d_histogram1, num_levels1, d_levels1, num_row_samples, num_rows, row_stride_bytes, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram, int num_levels, LevelT *d_levels, OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return HistogramRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_samples, num_rows, row_stride_bytes, stream); } /** * @brief Computes per-channel intensity histograms from a sequence of * multi-channel "pixel" data samples using the specified bin * boundary levels. * * @par * - The input is a sequence of *pixel* structures, where each pixel * comprises a record of `NUM_CHANNELS` consecutive data samples * (e.g., an *RGBA* pixel). * - Of the `NUM_CHANNELS` specified, the function will only compute * histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms * from *RGBA* pixel samples). * - The number of histogram bins for channeli is * `num_levels[i] - 1`. * - For channeli, the range of values for all histogram * bins have the same width: * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` * - For given channels `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the * range `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall * not overlap `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` nor * `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. * The ranges `[d_levels[c2], d_levels[c2] + num_levels[c2])` and * `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` may overlap. * - @devicestorage * * @par Snippet * The code snippet below illustrates the computation of three 4-bin *RGB* * histograms from a quad-channel sequence of *RGBA* pixels * (8 bits per channel per pixel) * * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input samples and output histograms * int num_pixels; // e.g., 5 * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), * // (0, 6, 7, 5),(3, 0, 2, 6)] * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; * int num_levels[3]; // e.g., {5, 5, 5}; * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], * // [0, 2, 4, 6, 8], * // [0, 2, 4, 6, 8] ]; * ... 
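 *
 * // (Illustrative assumption, not shown above: one possible way to set up
 * //  the per-channel arrays is to give each d_levels[c] and d_histogram[c]
 * //  its own device allocation, e.g.
 * //    cudaMalloc(&d_levels[c], 5 * sizeof(unsigned int));
 * //    cudaMalloc(&d_histogram[c], 4 * sizeof(unsigned int));
 * //  followed by a cudaMemcpy of the boundary values into d_levels[c],
 * //  so that both are host-side arrays of device pointers.)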
* * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramRange<4, 3>( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_pixels); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::MultiHistogramRange<4, 3>( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_pixels); * * // d_histogram <-- [ [1, 3, 0, 1], * // [3, 0, 0, 2], * // [0, 2, 0, 3] ] * * @endcode * * @tparam NUM_CHANNELS * Number of channels interleaved in the input data (may be greater than * the number of channels being actively histogrammed) * * @tparam NUM_ACTIVE_CHANNELS * **[inferred]** Number of channels actively being histogrammed * * @tparam SampleIteratorT * **[inferred]** Random-access input iterator type for reading * input samples. \iterator * * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * * @tparam OffsetT * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_samples * The pointer to the multi-channel input sequence of data samples. * The samples from different channels are assumed to be interleaved (e.g., * an array of 32-bit pixels where each pixel consists of four *RGBA* * 8-bit samples). * * @param[out] d_histogram * The pointers to the histogram counter output arrays, one for each active * channel. For channeli, the allocation length of * `d_histogram[i]` should be `num_levels[i] - 1`. * * @param[in] num_levels * The number of boundaries (levels) for delineating histogram samples in * each active channel. Implies that the number of bins for * channeli is `num_levels[i] - 1`. * * @param[in] d_levels * The pointers to the arrays of boundaries (levels), one for each active * channel. Bin ranges are defined by consecutive boundary pairings: lower * sample value boundaries are inclusive and upper sample value boundaries * are exclusive. * * @param[in] num_pixels * The number of multi-channel pixels * (i.e., the length of `d_samples / NUM_CHANNELS`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
*/ template CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram[NUM_ACTIVE_CHANNELS], int num_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_pixels, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; return MultiHistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_pixels, (OffsetT)1, (size_t)(sizeof(SampleT) * NUM_CHANNELS * num_pixels), stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram[NUM_ACTIVE_CHANNELS], int num_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_pixels, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return MultiHistogramRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_pixels, stream); } /** * @brief Computes per-channel intensity histograms from a sequence of * multi-channel "pixel" data samples using the specified bin boundary * levels. * * @par * - The input is a sequence of *pixel* structures, where each pixel comprises * a record of `NUM_CHANNELS` consecutive data samples * (e.g., an *RGBA* pixel). * - Of the `NUM_CHANNELS` specified, the function will only compute * histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms * from *RGBA* pixel samples). * - A two-dimensional *region of interest* within `d_samples` can be * specified using the `num_row_samples`, `num_rows`, and `row_stride_bytes` * parameters. * - The row stride must be a whole multiple of the sample data type * size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`. * - The number of histogram bins for channeli is * `num_levels[i] - 1`. * - For channeli, the range of values for all histogram * bins have the same width: * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` * - For a given row `r` in `[0, num_rows)`, and sample `s` in * `[0, num_row_pixels)`, let * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, * `sample_begin = row_begin + s * NUM_CHANNELS`, and * `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For given channels * `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the range * `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall not * overlap `[sample_begin, sample_end)` nor * `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. The ranges * `[d_levels[c2], d_levels[c2] + num_levels[c2])` and * `[sample_begin, sample_end)` may overlap. * - @devicestorage * * @par Snippet * The code snippet below illustrates the computation of three 4-bin *RGB* * histograms from a 2x3 region of interest of within a flattened 2x4 array * of quad-channel *RGBA* pixels (8 bits per channel per pixel). 
* * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input * // samples and output histograms * int num_row_pixels; // e.g., 3 * int num_rows; // e.g., 2 * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; * int num_levels[3]; // e.g., {5, 5, 5}; * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], * // [0, 2, 4, 6, 8], * // [0, 2, 4, 6, 8] ]; * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramRange<4, 3>( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, * num_row_pixels, num_rows, row_stride_bytes); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::MultiHistogramRange<4, 3>( * d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, * d_levels, num_row_pixels, num_rows, row_stride_bytes); * * // d_histogram <-- [ [2, 3, 0, 1], * // [3, 0, 0, 2], * // [1, 2, 0, 3] ] * * @endcode * * @tparam NUM_CHANNELS * Number of channels interleaved in the input data (may be greater than * the number of channels being actively histogrammed) * * @tparam NUM_ACTIVE_CHANNELS * **[inferred]** Number of channels actively being histogrammed * * @tparam SampleIteratorT * **[inferred]** Random-access input iterator type for reading input * samples. \iterator * * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * * @tparam OffsetT * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to \p temp_storage_bytes and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_samples * The pointer to the multi-channel input sequence of data samples. The * samples from different channels are assumed to be interleaved (e.g., an * array of 32-bit pixels where each pixel consists of four * *RGBA* 8-bit samples). * * @param[out] d_histogram * The pointers to the histogram counter output arrays, one for each active * channel. For channeli, the allocation length of * `d_histogram[i]` should be `num_levels[i] - 1`. * * @param[in] num_levels * The number of boundaries (levels) for delineating histogram samples in * each active channel. Implies that the number of bins for * channeli is `num_levels[i] - 1`. * * @param[in] d_levels * The pointers to the arrays of boundaries (levels), one for each active * channel. Bin ranges are defined by consecutive boundary pairings: lower * sample value boundaries are inclusive and upper sample value boundaries * are exclusive. 
* * @param[in] num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param[in] num_rows * The number of rows in the region of interest * * @param[in] row_stride_bytes * The number of bytes between starts of consecutive rows in the * region of interest * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram[NUM_ACTIVE_CHANNELS], int num_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; Int2Type is_byte_sample; if ((sizeof(OffsetT) > sizeof(int)) && ((unsigned long long)(num_rows * row_stride_bytes) < (unsigned long long)INT_MAX)) { // Down-convert OffsetT data type return DispatchHistogram::DispatchRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, (int)num_row_pixels, (int)num_rows, (int)(row_stride_bytes / sizeof(SampleT)), stream, is_byte_sample); } return DispatchHistogram::DispatchRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, (OffsetT)(row_stride_bytes / sizeof(SampleT)), stream, is_byte_sample); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_histogram[NUM_ACTIVE_CHANNELS], int num_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return MultiHistogramRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes, stream); } //@} end member group }; CUB_NAMESPACE_END cub-2.0.1/cub/device/device_merge_sort.cuh000066400000000000000000001104131434614775400205000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceMergeSort provides device-wide, parallel operations for * computing a merge sort across a sequence of data items residing within * device-accessible memory. * * @ingroup SingleModule * * @par Overview * - DeviceMergeSort arranges items into ascending order using a comparison * functor with less-than semantics. Merge sort can handle arbitrary types (as * long as a value of these types is a model of [LessThan Comparable]) and * comparison functors, but is slower than DeviceRadixSort when sorting * arithmetic types into ascending/descending order. * - Another difference from RadixSort is the fact that DeviceMergeSort can * handle arbitrary random-access iterators, as shown below. * * @par A Simple Example * @par * The code snippet below illustrates a thrust reverse iterator usage. * @par * @code * #include // or equivalently * * struct CustomLess * { * template * __device__ bool operator()(const DataType &lhs, const DataType &rhs) * { * return lhs < rhs; * } * }; * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * thrust::device_vector d_keys(num_items); * thrust::device_vector d_values(num_items); * // ... * * // Initialize iterator * using KeyIterator = typename thrust::device_vector::iterator; * thrust::reverse_iterator reverse_iter(d_keys.end()); * * // Determine temporary device storage requirements * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortPairs( * nullptr, * temp_storage_bytes, * reverse_iter, * thrust::raw_pointer_cast(d_values.data()), * num_items, * CustomLess()); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortPairs( * d_temp_storage, * temp_storage_bytes, * reverse_iter, * thrust::raw_pointer_cast(d_values.data()), * num_items, * CustomLess()); * @endcode * * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ struct DeviceMergeSort { /** * @brief Sorts items using a merge sorting method. * * @par * SortPairs is not guaranteed to be stable. That is, suppose that i and j are * equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 6, 5, 3, 0, 9] * int *d_values; // e.g., [0, 1, 2, 3, 4, 5, 6] * ... 
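 *
 * // (Illustrative assumption: CustomOpT is not defined by this snippet; it
 * //  could be a less-than functor such as
 * //    struct CustomOpT {
 * //      __device__ bool operator()(int lhs, int rhs) const { return lhs < rhs; }
 * //    };
 * //  in the spirit of the CustomLess example in the class-level
 * //  documentation above.)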
* * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 6, 8, 9] * // d_values <-- [5, 4, 3, 2, 1, 0, 6] * * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam ValueIteratorT * is a model of [Random Access Iterator], and `ValueIteratorT` is mutable. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in,out] d_items * Pointer to the input sequence of unsorted input values * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } /** * @brief Sorts items using a merge sorting method. * * @par * - SortPairsCopy is not guaranteed to be stable. That is, suppose * that `i` and `j` are equivalent: neither one is less than the * other. It is not guaranteed that the relative order of these * two elements will be preserved by sort. * - Input arrays `d_input_keys` and `d_input_items` are not modified. * - Note that the behavior is undefined if the input and output ranges * overlap in any way. 
* * @par Snippet * The code snippet below illustrates the sorting of a device vector of * `int` keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 6, 5, 3, 0, 9] * int *d_values; // e.g., [0, 1, 2, 3, 4, 5, 6] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortPairsCopy( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortPairsCopy( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 6, 8, 9] * // d_values <-- [5, 4, 3, 2, 1, 0, 6] * * @endcode * * @tparam KeyInputIteratorT * is a model of [Random Access Iterator]. Its `value_type` is a model of * [LessThan Comparable]. This `value_type`'s ordering relation is a * *strict weak ordering* as defined in the [LessThan Comparable] * requirements. * * @tparam ValueInputIteratorT * is a model of [Random Access Iterator]. * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam ValueIteratorT * is a model of [Random Access Iterator], and `ValueIteratorT` is mutable. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_input_keys * Pointer to the input sequence of unsorted input keys * * @param[in] d_input_items * Pointer to the input sequence of unsorted input values * * @param[out] d_output_keys * Pointer to the output sequence of sorted input keys * * @param[out] d_output_items * Pointer to the output sequence of sorted input values * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns `true` if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. 
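 *
 * @par
 * Because the input and output ranges must not overlap, a call would
 * typically use four distinct device buffers. A minimal sketch of such a
 * call (the `d_in_*` / `d_out_*` names are hypothetical, not part of the
 * snippet above):
 *
 * @code
 * cub::DeviceMergeSort::SortPairsCopy(
 *   d_temp_storage, temp_storage_bytes,
 *   d_in_keys, d_in_values,    // unsorted inputs (left unmodified)
 *   d_out_keys, d_out_values,  // sorted outputs (distinct storage)
 *   num_items, custom_op);
 * @endcode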
* * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch(d_temp_storage, temp_storage_bytes, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsCopy(d_temp_storage, temp_storage_bytes, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream); } /** * @brief Sorts items using a merge sorting method. * * @par * SortKeys is not guaranteed to be stable. That is, suppose that `i` and `j` * are equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, static_cast(nullptr), d_keys, static_cast(nullptr), num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } /** * @brief Sorts items using a merge sorting method. * * @par * - SortKeysCopy is not guaranteed to be stable. That is, suppose that `i` * and `j` are equivalent: neither one is less than the other. It is not * guaranteed that the relative order of these two elements will be * preserved by sort. * - Input array d_input_keys is not modified. * - Note that the behavior is undefined if the input and output ranges * overlap in any way. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of * `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortKeysCopy( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortKeysCopy( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyInputIteratorT * is a model of [Random Access Iterator]. Its `value_type` is a model of * [LessThan Comparable]. This `value_type`'s ordering relation is a * *strict weak ordering* as defined in the [LessThan Comparable] * requirements. * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. 
* * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_input_keys * Pointer to the input sequence of unsorted input keys * * @param[out] d_output_keys * Pointer to the output sequence of sorted input keys * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, KeyIteratorT d_output_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch(d_temp_storage, temp_storage_bytes, d_input_keys, static_cast(nullptr), d_output_keys, static_cast(nullptr), num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, KeyIteratorT d_output_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysCopy( d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } /** * @brief Sorts items using a merge sorting method. * * @par * StableSortPairs is stable: it preserves the relative ordering of equivalent * elements. That is, if x and y are elements such that x precedes y, * and if the two elements are equivalent (neither x < y nor y < x) then * a postcondition of stable_sort is that x still precedes y. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 6, 5, 3, 0, 9] * int *d_values; // e.g., [0, 1, 2, 3, 4, 5, 6] * ... 
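 *
 * // Note: the two keys equal to 6 are equivalent under the comparator, so
 * // StableSortPairs keeps their values in the original order (1 before 2).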
* * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 6, 8, 9] * // d_values <-- [5, 4, 3, 1, 2, 0, 6] * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam ValueIteratorT * is a model of [Random Access Iterator], and `ValueIteratorT` is mutable. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in,out] d_items * Pointer to the input sequence of unsorted input values * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } /** * @brief Sorts items using a merge sorting method. * * @par * StableSortKeys is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes `y`, * and if the two elements are equivalent (neither `x < y` nor `y < x`) then * a postcondition of stable_sort is that `x` still precedes `y`. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys. 
* \par * \code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { return SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/device_partition.cuh000066400000000000000000000704761434614775400203610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::DevicePartition provides device-wide, parallel operations for * partitioning sequences of data items residing within device-accessible memory. */ #pragma once #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DevicePartition provides device-wide, parallel operations for * partitioning sequences of data items residing within device-accessible * memory. ![](partition_logo.png) * @ingroup SingleModule * * @par Overview * These operations apply a selection criterion to construct a partitioned * output sequence from items selected/unselected from a specified input * sequence. * * @par Usage Considerations * \cdp_class{DevicePartition} * * @par Performance * \linear_performance{partition} * * @par * The following chart illustrates DevicePartition::If * performance across different CUDA architectures for @p int32 items, * where 50% of the items are randomly selected for the first partition. * \plots_below * * @image html partition_if_int32_50_percent.png * */ struct DevicePartition { /** * @brief Uses the @p d_flags sequence to split the corresponding items from * @p d_in into a partitioned sequence @p d_out. The total number of * items copied into the first partition is written to * @p d_num_selected_out. ![](partition_flags_logo.png) * * @par * - The value type of @p d_flags must be castable to @p bool (e.g., * @p bool, @p char, @p int, etc.). * - Copies of the selected items are compacted into @p d_out and maintain * their original relative ordering, however copies of the unselected * items are compacted into the rear of @p d_out in reverse order. * - The range `[d_out, d_out + num_items)` shall not overlap * `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any * way. The range `[d_in, d_in + num_items)` may overlap * `[d_flags, d_flags + num_items)`. * - \devicestorage * * @par Snippet * The code snippet below illustrates the compaction of items selected from * an @p int device vector. 
* @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input, flags, and output * int num_items; // e.g., 8 * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] * int *d_out; // e.g., [ , , , , , , , ] * int *d_num_selected_out; // e.g., [ ] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DevicePartition::Flagged( * d_temp_storage, temp_storage_bytes, * d_in, d_flags, d_out, d_num_selected_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run selection * cub::DevicePartition::Flagged( * d_temp_storage, temp_storage_bytes, * d_in, d_flags, d_out, d_num_selected_out, num_items); * * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] * // d_num_selected_out <-- [4] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading * input items \iterator * * @tparam FlagIterator * **[inferred]** Random-access input iterator type for reading * selection flags \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing * output items \iterator * * @tparam NumSelectedIteratorT * **[inferred]** Output iterator type for recording the number * of items selected \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to @p temp_storage_bytes and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[in] d_flags * Pointer to the input sequence of selection flags * * @param[out] d_out * Pointer to the output sequence of partitioned data items * * @param[out] d_num_selected_out * Pointer to the output total number of items selected (i.e., the * offset of the unselected partition) * * @param[in] num_items * Total number of items to select from * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Flagged(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream = 0) { using OffsetT = int; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) using DispatchSelectIfT = DispatchSelectIf; return DispatchSelectIfT::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, SelectOp{}, EqualityOp{}, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Flagged(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } /** * @brief Uses the @p select_op functor to split the corresponding items * from @p d_in into a partitioned sequence @p d_out. 
The total * number of items copied into the first partition is written to * @p d_num_selected_out. ![](partition_logo.png) * * @par * - Copies of the selected items are compacted into @p d_out and maintain * their original relative ordering, however copies of the unselected * items are compacted into the rear of @p d_out in reverse order. * - The range `[d_out, d_out + num_items)` shall not overlap * `[d_in, d_in + num_items)` in any way. * - \devicestorage * * @par Performance * The following charts illustrate saturated partition-if performance across * different CUDA architectures for @p int32 and @p int64 items, * respectively. Items are selected for the first partition with 50% * probability. * * @image html partition_if_int32_50_percent.png * @image html partition_if_int64_50_percent.png * * @par * The following charts are similar, but 5% selection probability for the * first partition: * * @image html partition_if_int32_5_percent.png * @image html partition_if_int64_5_percent.png * * @par Snippet * The code snippet below illustrates the compaction of items selected from * an @p int device vector. * @par * @code * #include * // or equivalently * * // Functor type for selecting values less than some criteria * struct LessThan * { * int compare; * * CUB_RUNTIME_FUNCTION __forceinline__ * explicit LessThan(int compare) : compare(compare) {} * * CUB_RUNTIME_FUNCTION __forceinline__ * bool operator()(const int &a) const * { * return (a < compare); * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 8 * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] * int *d_out; // e.g., [ , , , , , , , ] * int *d_num_selected_out; // e.g., [ ] * LessThan select_op(7); * ... * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DevicePartition::If( * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items, select_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run selection * cub::DevicePartition::If( * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items, select_op); * * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] * // d_num_selected_out <-- [5] * * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing output * items \iterator * * @tparam NumSelectedIteratorT * **[inferred]** Output iterator type for recording the number of items * selected \iterator * * @tparam SelectOp * **[inferred]** Selection functor type having member * `bool operator()(const T &a)` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output sequence of partitioned data items * * @param[out] d_num_selected_out * Pointer to the output total number of items selected (i.e., the * offset of the unselected partition) * * @param[in] num_items * Total number of items to select from * * @param[in] select_op * Unary selection operator * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t If(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream = 0) { using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType *; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) using DispatchSelectIfT = DispatchSelectIf; return DispatchSelectIfT::Dispatch(d_temp_storage, temp_storage_bytes, d_in, nullptr, d_out, d_num_selected_out, select_op, EqualityOp{}, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t If(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } /** * @brief Uses two functors to split the corresponding items from @p d_in * into a three partitioned sequences @p d_first_part_out * @p d_second_part_out and @p d_unselected_out. * The total number of items copied into the first partition is written * to `d_num_selected_out[0]`, while the total number of items copied * into the second partition is written to `d_num_selected_out[1]`. * * @par * - Copies of the items selected by @p select_first_part_op are compacted * into @p d_first_part_out and maintain their original relative ordering. * - Copies of the items selected by @p select_second_part_op are compacted * into @p d_second_part_out and maintain their original relative ordering. * - Copies of the unselected items are compacted into the * @p d_unselected_out in reverse order. * - The ranges `[d_out, d_out + num_items)`, * `[d_first_part_out, d_first_part_out + d_num_selected_out[0])`, * `[d_second_part_out, d_second_part_out + d_num_selected_out[1])`, * `[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])`, * shall not overlap in any way. * * @par Snippet * The code snippet below illustrates how this algorithm can partition an * input vector into small, medium, and large items so that the relative * order of items remain deterministic. * * Let's consider any value that doesn't exceed six a small one. On the * other hand, any value that exceeds 50 will be considered a large one. * Since the value used to define a small part doesn't match one that * defines the large part, the intermediate segment is implied. * * These definitions partition a value space into three categories. We want * to preserve the order of items in which they appear in the input vector. * Since the algorithm provides stable partitioning, this is possible. 
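 *
 * For the sample input used below, `[0, 2, 3, 9, 5, 2, 81, 8]`, the small
 * category therefore contains `0, 2, 3, 5, 2`, the large category contains
 * only `81`, and `9` and `8` fall into the middle (unselected) category.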
* * Since the number of items in each category is unknown beforehand, we need * three output arrays of num_items elements each. To reduce the memory * requirements, we can combine the output storage for two categories. * * Since each value falls precisely in one category, it's safe to add * "large" values into the head of the shared output vector and the "middle" * values into its tail. To add items into the tail of the output array, we * can use `thrust::reverse_iterator`. * @par * @code * #include * // or equivalently * * // Functor type for selecting values less than some criteria * struct LessThan * { * int compare; * * CUB_RUNTIME_FUNCTION __forceinline__ * explicit LessThan(int compare) : compare(compare) {} * * CUB_RUNTIME_FUNCTION __forceinline__ * bool operator()(const int &a) const * { * return a < compare; * } * }; * * // Functor type for selecting values greater than some criteria * struct GreaterThan * { * int compare; * * CUB_RUNTIME_FUNCTION __forceinline__ * explicit GreaterThan(int compare) : compare(compare) {} * * CUB_RUNTIME_FUNCTION __forceinline__ * bool operator()(const int &a) const * { * return a > compare; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 8 * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] * int *d_large_and_unselected_out; // e.g., [ , , , , , , , ] * int *d_small_out; // e.g., [ , , , , , , , ] * int *d_num_selected_out; // e.g., [ , ] * thrust::reverse_iterator unselected_out(d_large_and_unselected_out + num_items); * LessThan small_items_selector(7); * GreaterThan large_items_selector(50); * ... * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DevicePartition::If( * d_temp_storage, temp_storage_bytes, * d_in, d_large_and_medium_out, d_small_out, unselected_out, * d_num_selected_out, num_items, * large_items_selector, small_items_selector); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run selection * cub::DevicePartition::If( * d_temp_storage, temp_storage_bytes, * d_in, d_large_and_medium_out, d_small_out, unselected_out, * d_num_selected_out, num_items, * large_items_selector, small_items_selector); * * // d_large_and_unselected_out <-- [ 81, , , , , , 8, 9 ] * // d_small_out <-- [ 0, 2, 3, 5, 2, , , ] * // d_num_selected_out <-- [ 1, 5 ] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading * input items \iterator * * @tparam FirstOutputIteratorT * **[inferred]** Random-access output iterator type for writing output * items selected by first operator \iterator * * @tparam SecondOutputIteratorT * **[inferred]** Random-access output iterator type for writing output * items selected by second operator \iterator * * @tparam UnselectedOutputIteratorT * **[inferred]** Random-access output iterator type for writing * unselected items \iterator * * @tparam NumSelectedIteratorT * **[inferred]** Output iterator type for recording the number of items * selected \iterator * * @tparam SelectFirstPartOp * **[inferred]** Selection functor type having member * `bool operator()(const T &a)` * * @tparam SelectSecondPartOp * **[inferred]** Selection functor type having member * `bool operator()(const T &a)` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to @p temp_storage_bytes and * no work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_first_part_out * Pointer to the output sequence of data items selected by * @p select_first_part_op * * @param[out] d_second_part_out * Pointer to the output sequence of data items selected by * @p select_second_part_op * * @param[out] d_unselected_out * Pointer to the output sequence of unselected data items * * @param[out] d_num_selected_out * Pointer to the output array with two elements, where total number of * items selected by @p select_first_part_op is stored as * `d_num_selected_out[0]` and total number of items selected by * @p select_second_part_op is stored as `d_num_selected_out[1]`, * respectively * * @param[in] num_items * Total number of items to select from * * @param[in] select_first_part_op * Unary selection operator to select @p d_first_part_out * * @param[in] select_second_part_op * Unary selection operator to select @p d_second_part_out * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t If(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) { using OffsetT = int; using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf; return DispatchThreeWayPartitionIfT::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, select_first_part_op, select_second_part_op, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t If(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return If(d_temp_storage, temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, num_items, select_first_part_op, select_second_part_op, stream); } }; /** * @example example_device_partition_flagged.cu * @example example_device_partition_if.cu */ CUB_NAMESPACE_END cub-2.0.1/cub/device/device_radix_sort.cuh000066400000000000000000001523271434614775400205220ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceRadixSort provides device-wide, parallel operations for * computing a radix sort across a sequence of data items residing within * device-accessible memory. */ #pragma once #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceRadixSort provides device-wide, parallel operations for * computing a radix sort across a sequence of data items residing * within device-accessible memory. ![](sorting_logo.png) * @ingroup SingleModule * * @par Overview * The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) * arranges items into ascending (or descending) order. The algorithm relies * upon a positional representation for keys, i.e., each key is comprised of an * ordered sequence of symbols (e.g., digits, characters, etc.) specified from * least-significant to most-significant. For a given input sequence of keys * and a set of rules specifying a total ordering of the symbolic alphabet, the * radix sorting method produces a lexicographic ordering of those keys. * * @par Supported Types * DeviceRadixSort can sort all of the built-in C++ numeric primitive types * (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half` * and `__nv_bfloat16` 16-bit floating-point types. * * @par Floating-Point Special Cases * * - Positive and negative zeros are considered equivalent, and will be treated * as such in the output. * - No special handling is implemented for NaN values; these are sorted * according to their bit representations after any transformations. * * @par Transformations * Although the direct radix sorting method can only be applied to unsigned * integral types, DeviceRadixSort is able to sort signed and floating-point * types via simple bit-wise transformations that ensure lexicographic key * ordering. Additional transformations occur for descending sorts. These * transformations must be considered when restricting the * `[begin_bit, end_bit)` range, as the bitwise transformations will occur * before the bit-range truncation. * * Any transformations applied to the keys prior to sorting are reversed * while writing to the final output buffer. 
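 *
 * @par
 * As an illustrative sketch (values and buffer names are arbitrary, and
 * error checking is omitted), the following complete program restricts the
 * sort of `unsigned int` keys to the bit range `[0, 16)`, so only the
 * low-order 16 bits participate in the comparison and keys that differ only
 * in their upper 16 bits keep their input order:
 *
 * @code
 * #include <cub/device/device_radix_sort.cuh>
 * #include <cuda_runtime.h>
 * #include <cstddef>
 * #include <cstdio>
 *
 * int main()
 * {
 *   constexpr int num_items = 4;
 *   // Low 16 bits are 2, 1, 2, 1; the upper 16 bits are ignored by the sort
 *   unsigned int h_keys[num_items] = {0x00010002u, 0x00020001u,
 *                                     0x00000002u, 0x00000001u};
 *   unsigned int *d_keys_in, *d_keys_out;
 *   cudaMalloc(&d_keys_in,  sizeof(h_keys));
 *   cudaMalloc(&d_keys_out, sizeof(h_keys));
 *   cudaMemcpy(d_keys_in, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
 *
 *   int begin_bit = 0;   // compare only bits [0, 16)
 *   int end_bit   = 16;
 *
 *   // Query temporary storage, then run the restricted-bit-range sort
 *   void *d_temp_storage = nullptr;
 *   std::size_t temp_storage_bytes = 0;
 *   cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
 *                                  d_keys_in, d_keys_out, num_items,
 *                                  begin_bit, end_bit);
 *   cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *   cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
 *                                  d_keys_in, d_keys_out, num_items,
 *                                  begin_bit, end_bit);
 *
 *   // Expected order: 0x00020001, 0x00000001, 0x00010002, 0x00000002
 *   cudaMemcpy(h_keys, d_keys_out, sizeof(h_keys), cudaMemcpyDeviceToHost);
 *   for (int i = 0; i < num_items; ++i)
 *   {
 *     std::printf("0x%08x\n", h_keys[i]);
 *   }
 *
 *   cudaFree(d_keys_in);
 *   cudaFree(d_keys_out);
 *   cudaFree(d_temp_storage);
 *   return 0;
 * }
 * @endcode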
* * \par Type Specific Bitwise Transformations * To convert the input values into a radix-sortable bitwise representation, * the following transformations take place prior to sorting: * * - For unsigned integral values, the keys are used directly. * - For signed integral values, the sign bit is inverted. * - For positive floating point values, the sign bit is inverted. * - For negative floating point values, the full key is inverted. * * For floating point types, positive and negative zero are a special case and * will be considered equivalent during sorting. * * @par Descending Sort Bitwise Transformations * If descending sort is used, the keys are inverted after performing any * type-specific transformations, and the resulting keys are sorted in ascending * order. * * @par Stability * DeviceRadixSort is stable. For floating-point types, `-0.0` and `+0.0` are * considered equal and appear in the result in the same order as they appear in * the input. * * @par Usage Considerations * @cdp_class{DeviceRadixSort} * * @par Performance * @linear_performance{radix sort} The following chart illustrates * DeviceRadixSort::SortKeys performance across different CUDA architectures * for uniform-random `uint32` keys. * @plots_below * * @image html lsb_radix_sort_int32_keys.png * */ struct DeviceRadixSort { /******************************************************************//** * \name KeyT-value pairs *********************************************************************/ //@{ /** * @brief Sorts key-value pairs into ascending order. * (`~2N` auxiliary storage required) * * @par * - The contents of the input data are not altered by the sorting operation. * - Pointers to contiguous memory must be used; iterators are not currently * supported. * - In-place operations are not supported. There must be no overlap between * any of the provided ranges: * - `[d_keys_in, d_keys_in + num_items)` * - `[d_keys_out, d_keys_out + num_items)` * - `[d_values_in, d_values_in + num_items)` * - `[d_values_out, d_values_out + num_items)` * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - @devicestorageNP For sorting using only `O(P)` temporary storage, see * the sorting interface using DoubleBuffer wrappers below. * - @devicestorage * * @par Performance * The following charts illustrate saturated sorting performance across * different CUDA architectures for uniform-random `uint32, uint32` and * `uint64, uint64` pairs, respectively. * * @image html lsb_radix_sort_int32_pairs.png * @image html lsb_radix_sort_int64_pairs.png * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [ ... ] * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_values_out; // e.g., [ ... ] * ... 
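 *
 * // d_keys_out and d_values_out must be distinct, caller-allocated buffers
 * // of at least num_items elements each; this overload never sorts in place.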
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); * * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] * @endcode * * @tparam KeyT * **[inferred]** KeyT type * * @tparam ValueT * **[inferred]** ValueT type * * @tparam NumItemsT * **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input data of key data to sort * * @param[out] d_keys_out * Pointer to the sorted output sequence of key data * * @param[in] d_values_in * Pointer to the corresponding input sequence of associated value items * * @param[out] d_values_out * Pointer to the correspondingly-reordered output sequence of associated * value items * * @param[in] num_items * Number of items to sort * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., sizeof(unsigned int) * 8) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Unsigned integer type for global offsets. using OffsetT = typename detail::ChooseOffsetT::Type; // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, static_cast(num_items), begin_bit, end_bit, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream); } /** * @brief Sorts key-value pairs into ascending order. * (`~N` auxiliary storage required) * * @par * - The sorting operation is given a pair of key buffers and a corresponding * pair of associated value buffers. 
Each pair is managed by a DoubleBuffer * structure that indicates which of the two buffers is "current" (and thus * contains the input data to be sorted). * - The contents of both buffers within each pair may be altered by the * sorting operation. * - In-place operations are not supported. There must be no overlap between * any of the provided ranges: * - `[d_keys.Current(), d_keys.Current() + num_items)` * - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` * - `[d_values.Current(), d_values.Current() + num_items)` * - `[d_values.Alternate(), d_values.Alternate() + num_items)` * - Upon completion, the sorting operation will update the "current" * indicator within each DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the * number of key bits specified and the targeted device architecture). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - @devicestorageP * - @devicestorage * * @par Performance * The following charts illustrate saturated sorting performance across * different CUDA architectures for uniform-random `uint32, uint32` and * `uint64, uint64` pairs, respectively. * * @image html lsb_radix_sort_int32_pairs.png * @image html lsb_radix_sort_int64_pairs.png * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [ ... ] * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_value_alt_buf; // e.g., [ ... ] * ... * * // Create a set of DoubleBuffers to wrap pairs of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRadixSort::SortPairs( * d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceRadixSort::SortPairs( * d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); * * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] * * @endcode * * @tparam KeyT * **[inferred]** KeyT type * * @tparam ValueT * **[inferred]** ValueT type * * @tparam NumItemsT * **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to \p temp_storage_bytes and no work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in,out] d_values * Double-buffer of values whose "current" device-accessible buffer * contains the unsorted input values and, upon return, is updated to point * to the sorted output values * * @param[in] num_items * Number of items to sort * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Unsigned integer type for global offsets. using OffsetT = typename detail::ChooseOffsetT::Type; constexpr bool is_overwrite_okay = true; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream); } /** * @brief Sorts key-value pairs into descending order. * (`~2N` auxiliary storage required). * * @par * - The contents of the input data are not altered by the sorting operation. * - Pointers to contiguous memory must be used; iterators are not currently * supported. * - In-place operations are not supported. There must be no overlap between * any of the provided ranges: * - `[d_keys_in, d_keys_in + num_items)` * - `[d_keys_out, d_keys_out + num_items)` * - `[d_values_in, d_values_in + num_items)` * - `[d_values_out, d_values_out + num_items)` * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - @devicestorageNP For sorting using only `O(P)` temporary storage, see * the sorting interface using DoubleBuffer wrappers below. * - @devicestorage * * @par Performance * Performance is similar to DeviceRadixSort::SortPairs. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [ ... ] * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_values_out; // e.g., [ ... ] * ... 
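 *
 * // For descending order, the transformed key bits are inverted before the
 * // ascending radix passes, so the largest keys appear first in d_keys_out.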
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRadixSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceRadixSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); * * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] * @endcode * * @tparam KeyT * **[inferred]** KeyT type * * @tparam ValueT * **[inferred]** ValueT type * * @tparam NumItemsT * **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * * @param[in] d_keys_in * Pointer to the input data of key data to sort * * @param[out] d_keys_out * Pointer to the sorted output sequence of key data * * @param[in] d_values_in * Pointer to the corresponding input sequence of associated value items * * @param[out] d_values_out * Pointer to the correspondingly-reordered output sequence of associated * value items * * @param[in] num_items * Number of items to sort * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Unsigned integer type for global offsets. using OffsetT = typename detail::ChooseOffsetT::Type; // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream); } /** * @brief Sorts key-value pairs into descending order. * (`~N` auxiliary storage required). * * @par * - The sorting operation is given a pair of key buffers and a corresponding * pair of associated value buffers. 
Each pair is managed by a DoubleBuffer * structure that indicates which of the two buffers is "current" (and thus * contains the input data to be sorted). * - The contents of both buffers within each pair may be altered by the * sorting operation. * - In-place operations are not supported. There must be no overlap between * any of the provided ranges: * - `[d_keys.Current(), d_keys.Current() + num_items)` * - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` * - `[d_values.Current(), d_values.Current() + num_items)` * - `[d_values.Alternate(), d_values.Alternate() + num_items)` * - Upon completion, the sorting operation will update the "current" * indicator within each DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the number * of key bits specified and the targeted device architecture). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - @devicestorageP * - @devicestorage * * @par Performance * Performance is similar to DeviceRadixSort::SortPairs. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [ ... ] * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_value_alt_buf; // e.g., [ ... ] * ... * * // Create a set of DoubleBuffers to wrap pairs of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRadixSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceRadixSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); * * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] * @endcode * * @tparam KeyT * **[inferred]** KeyT type * * @tparam ValueT * **[inferred]** ValueT type * * @tparam NumItemsT * **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in,out] d_values * Double-buffer of values whose "current" device-accessible buffer * contains the unsorted input values and, upon return, is updated to point * to the sorted output values * * @param[in] num_items * Number of items to sort * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Unsigned integer type for global offsets. using OffsetT = typename detail::ChooseOffsetT::Type; constexpr bool is_overwrite_okay = true; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream); } //@} end member group /******************************************************************//** * @name Keys-only *********************************************************************/ //@{ /** * @brief Sorts keys into ascending order. * (`~2N` auxiliary storage required) * * @par * - The contents of the input data are not altered by the sorting operation. * - Pointers to contiguous memory must be used; iterators are not currently * supported. * - In-place operations are not supported. There must be no overlap between * any of the provided ranges: * - `[d_keys_in, d_keys_in + num_items)` * - `[d_keys_out, d_keys_out + num_items)` * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - @devicestorageNP For sorting using only `O(P)` temporary storage, see * the sorting interface using DoubleBuffer wrappers below. * - @devicestorage * * @par Performance * The following charts illustrate saturated sorting performance across * different CUDA architectures for uniform-random `uint32` and `uint64` * keys, respectively. * * @image html lsb_radix_sort_int32_keys.png * @image html lsb_radix_sort_int64_keys.png * * @par Snippet * The code snippet below illustrates the sorting of a device vector of * `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [ ... 
] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); * * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyT * **[inferred]** KeyT type * * @tparam NumItemsT * **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input data of key data to sort * * @param[out] d_keys_out * Pointer to the sorted output sequence of key data * * @param[in] num_items * Number of items to sort * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */
template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Unsigned integer type for global offsets. using OffsetT = typename detail::ChooseOffsetT::Type; // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); // Null value type DoubleBuffer d_values; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, static_cast(num_items), begin_bit, end_bit, is_overwrite_okay, stream); }
template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream); }
/** * @brief Sorts keys into ascending order. (`~N` auxiliary storage required). * * @par * - The sorting operation is given a pair of key buffers managed by a * DoubleBuffer structure that indicates which of the two buffers is * "current" (and thus contains the input data to be sorted). * - The contents of both buffers may be altered by the sorting operation. * - In-place operations are not supported.
There must be no overlap between * any of the provided ranges: * - `[d_keys.Current(), d_keys.Current() + num_items)` * - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` * - Upon completion, the sorting operation will update the "current" * indicator within the DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the * number of key bits specified and the targeted device architecture). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - @devicestorageP * - @devicestorage * * @par Performance * The following charts illustrate saturated sorting performance across * different CUDA architectures for uniform-random `uint32` and `uint64` * keys, respectively. * * @image html lsb_radix_sort_int32_keys.png * @image html lsb_radix_sort_int64_keys.png * * @par Snippet * The code snippet below illustrates the sorting of a device vector of * `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [ ... ] * ... * * // Create a DoubleBuffer to wrap the pair of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys, num_items); * * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyT * **[inferred]** KeyT type * * @tparam NumItemsT * **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in] num_items * Number of items to sort * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Unsigned integer type for global offsets. 
using OffsetT = typename detail::ChooseOffsetT::Type; constexpr bool is_overwrite_okay = true; // Null value type DoubleBuffer d_values; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); }
template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream); }
/** * @brief Sorts keys into descending order. * (`~2N` auxiliary storage required). * * @par * - The contents of the input data are not altered by the sorting operation. * - Pointers to contiguous memory must be used; iterators are not currently * supported. * - In-place operations are not supported. There must be no overlap between * any of the provided ranges: * - `[d_keys_in, d_keys_in + num_items)` * - `[d_keys_out, d_keys_out + num_items)` * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - @devicestorageNP For sorting using only `O(P)` temporary storage, see * the sorting interface using DoubleBuffer wrappers below. * - @devicestorage * * @par Performance * Performance is similar to DeviceRadixSort::SortKeys. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of * `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [ ... ] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRadixSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceRadixSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); * * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] * * @endcode * * @tparam KeyT * **[inferred]** KeyT type * * @tparam NumItemsT * **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input data of key data to sort * * @param[out] d_keys_out * Pointer to the sorted output sequence of key data * * @param[in] num_items * Number of items to sort * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0.
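 *
 * @par
 * A minimal bit-subrange sketch, reusing the declarations from the snippet
 * above (with `d_temp_storage` initially `NULL`) and assuming the keys are
 * known to differ only in their low 16 bits:
 * @code
 * // Query temporary storage for a sort restricted to bits [0, 16)
 * cub::DeviceRadixSort::SortKeysDescending(
 *     d_temp_storage, temp_storage_bytes,
 *     d_keys_in, d_keys_out, num_items, 0, 16);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 * // Run the sort; only the 16 least-significant key bits are compared,
 * // so fewer radix passes are needed
 * cub::DeviceRadixSort::SortKeysDescending(
 *     d_temp_storage, temp_storage_bytes,
 *     d_keys_in, d_keys_out, num_items, 0, 16);
 * @endcode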
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Unsigned integer type for global offsets. using OffsetT = typename detail::ChooseOffsetT::Type; // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream); } /** * @brief Sorts keys into descending order. * (`~N` auxiliary storage required). * * @par * - The sorting operation is given a pair of key buffers managed by a * DoubleBuffer structure that indicates which of the two buffers is * "current" (and thus contains the input data to be sorted). * - The contents of both buffers may be altered by the sorting operation. * - In-place operations are not supported. There must be no overlap between * any of the provided ranges: * - `[d_keys.Current(), d_keys.Current() + num_items)` * - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` * - Upon completion, the sorting operation will update the "current" * indicator within the DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the * number of key bits specified and the targeted device architecture). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - @devicestorageP * - @devicestorage * * @par Performance * Performance is similar to DeviceRadixSort::SortKeys. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of @p int keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [ ... ] * ... 
* * // Create a DoubleBuffer to wrap the pair of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRadixSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceRadixSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys, num_items); * * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] * @endcode * * @tparam KeyT * **[inferred]** KeyT type * * @tparam NumItemsT * **[inferred]** Type of num_items * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in] num_items * Number of items to sort * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Unsigned integer type for global offsets. using OffsetT = typename detail::ChooseOffsetT::Type; constexpr bool is_overwrite_okay = true; // Null value type DoubleBuffer d_values; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream); } //@} end member group }; /** * @example example_device_radix_sort.cu */ CUB_NAMESPACE_END cub-2.0.1/cub/device/device_reduce.cuh000066400000000000000000001341631434614775400176110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceReduce provides device-wide, parallel operations for * computing a reduction across a sequence of data items residing within * device-accessible memory. */ #pragma once #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceReduce provides device-wide, parallel operations for computing * a reduction across a sequence of data items residing within * device-accessible memory. ![](reduce_logo.png) * @ingroup SingleModule * * @par Overview * A *reduction* * (or *fold*) uses a binary combining operator to compute a single aggregate * from a sequence of input elements. * * @par Usage Considerations * @cdp_class{DeviceReduce} * * @par Performance * @linear_performance{reduction, reduce-by-key, and run-length encode} * * @par * The following chart illustrates DeviceReduce::Sum * performance across different CUDA architectures for \p int32 keys. * * @image html reduce_int32.png * * @par * The following chart illustrates DeviceReduce::ReduceByKey (summation) * performance across different CUDA architectures for `fp32` values. Segments * are identified by `int32` keys, and have lengths uniformly sampled * from `[1, 1000]`. * * @image html reduce_by_key_fp32_len_500.png * * @par * @plots_below * */ struct DeviceReduce { /** * @brief Computes a device-wide reduction using the specified binary * `reduction_op` functor and initial value `init`. * * @par * - Does not support binary reduction operators that are non-commutative. * - Provides "run-to-run" determinism for pseudo-associative reduction * (e.g., addition of floating point types) on the same GPU device. * However, results for pseudo-associative reduction may be inconsistent * from one device to a another device of a different compute-capability * because CUB can employ different tile-sizing for different architectures. * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. * - @devicestorage * * @par Snippet * The code snippet below illustrates a user-defined min-reduction of a * device vector of `int` data elements. * @par * @code * #include * // or equivalently * * // CustomMin functor * struct CustomMin * { * template * __device__ __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? 
b : a; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [-] * CustomMin min_op; * int init; // e.g., INT_MAX * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceReduce::Reduce( * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items, min_op, init); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run reduction * cub::DeviceReduce::Reduce( * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items, min_op, init); * * // d_out <-- [0] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @tparam ReductionOpT * **[inferred]** Binary reduction functor type having member * `T operator()(const T &a, const T &b)` * * @tparam T * **[inferred]** Data element type that is convertible to the `value` type * of `InputIteratorT` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_in[in] * Pointer to the input sequence of data items * * @param d_out[out] * Pointer to the output aggregate * * @param num_items[in] * Total number of input items (i.e., length of `d_in`) * * @param reduction_op[in] * Binary reduction functor * * @param[in] init * Initial value of the reduction * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; return DispatchReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Reduce( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } /** * @brief Computes a device-wide sum using the addition (`+`) operator. * * @par * - Uses `0` as the initial value of the reduction. * - Does not support \p + operators that are non-commutative.. * - Provides "run-to-run" determinism for pseudo-associative reduction * (e.g., addition of floating point types) on the same GPU device. * However, results for pseudo-associative reduction may be inconsistent * from one device to a another device of a different compute-capability * because CUB can employ different tile-sizing for different architectures. * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. 
* - @devicestorage * * @par Performance * The following charts illustrate saturated sum-reduction performance across * different CUDA architectures for `int32` and `int64` items, respectively. * * @image html reduce_int32.png * @image html reduce_int64.png * * @par Snippet * The code snippet below illustrates the sum-reduction of a device vector * of `int` data elements. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [-] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceReduce::Sum( * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sum-reduction * cub::DeviceReduce::Sum( * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); * * // d_out <-- [38] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The output value type using OutputT = cub::detail::non_void_value_t>; using InitT = OutputT; return DispatchReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum(), InitT{}, // zero-initialize stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } /** * @brief Computes a device-wide minimum using the less-than ('<') operator. * * @par * - Uses `std::numeric_limits::max()` as the initial value of the reduction. * - Does not support `<` operators that are non-commutative. * - Provides "run-to-run" determinism for pseudo-associative reduction * (e.g., addition of floating point types) on the same GPU device. * However, results for pseudo-associative reduction may be inconsistent * from one device to a another device of a different compute-capability * because CUB can employ different tile-sizing for different architectures. * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. * - @devicestorage * * @par Snippet * The code snippet below illustrates the min-reduction of a device vector of * `int` data elements. 
* @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [-] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceReduce::Min( * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run min-reduction * cub::DeviceReduce::Min( * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); * * // d_out <-- [0] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The input value type using InputT = cub::detail::value_t; using InitT = InputT; return DispatchReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Min(), // replace with // std::numeric_limits::max() when // C++11 support is more prevalent Traits::Max(), stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } /** * @brief Finds the first device-wide minimum using the less-than ('<') * operator, also returning the index of that item. * * @par * - The output value type of `d_out` is cub::KeyValuePair `` * (assuming the value type of `d_in` is `T`) * - The minimum is written to `d_out.value` and its offset in the input * array is written to `d_out.key`. * - The `{1, std::numeric_limits::max()}` tuple is produced for * zero-length inputs * - Does not support `<` operators that are non-commutative. * - Provides "run-to-run" determinism for pseudo-associative reduction * (e.g., addition of floating point types) on the same GPU device. * However, results for pseudo-associative reduction may be inconsistent * from one device to a another device of a different compute-capability * because CUB can employ different tile-sizing for different architectures. * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. * - @devicestorage * * @par Snippet * The code snippet below illustrates the argmin-reduction of a device vector * of `int` data elements. 
* @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * KeyValuePair *d_out; // e.g., [{-,-}] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceReduce::ArgMin( * d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run argmin-reduction * cub::DeviceReduce::ArgMin( * d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); * * // d_out <-- [{5, 0}] * * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input items * (of some type `T`) \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced aggregate * (having value type `cub::KeyValuePair`) \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to \p temp_storage_bytes and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; using InitT = OutputTupleT; // The output value type using OutputValueT = typename OutputTupleT::Value; // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value // replace with std::numeric_limits::max() when C++11 support is // more prevalent InitT initial_value(1, Traits::Max()); return DispatchReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } /** * @brief Computes a device-wide maximum using the greater-than ('>') operator. * * @par * - Uses `std::numeric_limits::lowest()` as the initial value of the * reduction. * - Does not support `>` operators that are non-commutative. * - Provides "run-to-run" determinism for pseudo-associative reduction * (e.g., addition of floating point types) on the same GPU device. * However, results for pseudo-associative reduction may be inconsistent * from one device to a another device of a different compute-capability * because CUB can employ different tile-sizing for different architectures. 
* - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. * - @devicestorage * * @par Snippet * The code snippet below illustrates the max-reduction of a device vector of * `int` data elements. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [-] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceReduce::Max( * d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run max-reduction * cub::DeviceReduce::Max( * d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); * * // d_out <-- [9] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The input value type using InputT = cub::detail::value_t; using InitT = InputT; return DispatchReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Max(), // replace with // std::numeric_limits::lowest() // when C++11 support is more // prevalent Traits::Lowest(), stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } /** * @brief Finds the first device-wide maximum using the greater-than ('>') * operator, also returning the index of that item * * @par * - The output value type of `d_out` is cub::KeyValuePair `` * (assuming the value type of `d_in` is `T`) * - The maximum is written to `d_out.value` and its offset in the input * array is written to `d_out.key`. * - The `{1, std::numeric_limits::lowest()}` tuple is produced for * zero-length inputs * - Does not support `>` operators that are non-commutative. * - Provides "run-to-run" determinism for pseudo-associative reduction * (e.g., addition of floating point types) on the same GPU device. * However, results for pseudo-associative reduction may be inconsistent * from one device to a another device of a different compute-capability * because CUB can employ different tile-sizing for different architectures. 
* - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. * - @devicestorage * * @par Snippet * The code snippet below illustrates the argmax-reduction of a device vector * of `int` data elements. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * KeyValuePair *d_out; // e.g., [{-,-}] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceReduce::ArgMax( * d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run argmax-reduction * cub::DeviceReduce::ArgMax( * d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); * * // d_out <-- [{6, 9}] * * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input items * (of some type \p T) \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced aggregate * (having value type `cub::KeyValuePair`) \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; // The output value type using OutputValueT = typename OutputTupleT::Value; using InitT = OutputTupleT; // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value // replace with std::numeric_limits::lowest() when C++11 support is // more prevalent InitT initial_value(1, Traits::Lowest()); return DispatchReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } /** * @brief Reduces segments of values, where segments are demarcated by * corresponding runs of identical keys. * * @par * This operation computes segmented reductions within `d_values_in` using * the specified binary `reduction_op` functor. The segments are identified * by "runs" of corresponding keys in `d_keys_in`, where runs are maximal * ranges of consecutive, identical keys. 
For the *i*th run * encountered, the first key of the run and the corresponding value * aggregate of that run are written to `d_unique_out[i] and * `d_aggregates_out[i]`, respectively. The total number of runs encountered * is written to `d_num_runs_out`. * * @par * - The `==` equality operator is used to determine whether keys are * equivalent * - Provides "run-to-run" determinism for pseudo-associative reduction * (e.g., addition of floating point types) on the same GPU device. * However, results for pseudo-associative reduction may be inconsistent * from one device to a another device of a different compute-capability * because CUB can employ different tile-sizing for different architectures. * - Let `out` be any of * `[d_unique_out, d_unique_out + *d_num_runs_out)` * `[d_aggregates_out, d_aggregates_out + *d_num_runs_out)` * `d_num_runs_out`. The ranges represented by `out` shall not overlap * `[d_keys_in, d_keys_in + num_items)`, * `[d_values_in, d_values_in + num_items)` nor `out` in any way. * - @devicestorage * * @par Performance * The following chart illustrates reduction-by-key (sum) performance across * different CUDA architectures for `fp32` and `fp64` values, respectively. * Segments are identified by `int32` keys, and have lengths uniformly * sampled from `[1, 1000]`. * * @image html reduce_by_key_fp32_len_500.png * @image html reduce_by_key_fp64_len_500.png * * @par * The following charts are similar, but with segment lengths uniformly * sampled from [1,10]: * * @image html reduce_by_key_fp32_len_5.png * @image html reduce_by_key_fp64_len_5.png * * @par Snippet * The code snippet below illustrates the segmented reduction of `int` values * grouped by runs of associated `int` keys. * @par * @code * #include * // or equivalently * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? b : a; * } * }; * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] * int *d_num_runs_out; // e.g., [-] * CustomMin reduction_op; * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceReduce::ReduceByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_unique_out, d_values_in, * d_aggregates_out, d_num_runs_out, reduction_op, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run reduce-by-key * cub::DeviceReduce::ReduceByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_unique_out, d_values_in, * d_aggregates_out, d_num_runs_out, reduction_op, num_items); * * // d_unique_out <-- [0, 2, 9, 5, 8] * // d_aggregates_out <-- [0, 1, 6, 2, 4] * // d_num_runs_out <-- [5] * @endcode * * @tparam KeysInputIteratorT * **[inferred]** Random-access input iterator type for reading input * keys \iterator * * @tparam UniqueOutputIteratorT * **[inferred]** Random-access output iterator type for writing unique * output keys \iterator * * @tparam ValuesInputIteratorT * **[inferred]** Random-access input iterator type for reading input * values \iterator * * @tparam AggregatesOutputIteratorT * **[inferred]** Random-access output iterator type for writing output * value aggregates \iterator * * @tparam NumRunsOutputIteratorT * **[inferred]** Output iterator type for recording the number of runs * encountered \iterator * * @tparam ReductionOpT * **[inferred]** Binary reduction functor type having member * `T operator()(const T &a, const T &b)` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys * * @param[out] d_unique_out * Pointer to the output sequence of unique keys (one key per run) * * @param[in] d_values_in * Pointer to the input sequence of corresponding values * * @param[out] d_aggregates_out * Pointer to the output sequence of value aggregates * (one aggregate per run) * * @param[out] d_num_runs_out * Pointer to total number of runs encountered * (i.e., the length of `d_unique_out`) * * @param[in] reduction_op * Binary reduction functor * * @param[in] num_items * Total number of associated key+value pairs * (i.e., the length of `d_keys_in` and `d_values_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0.
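 *
 * @par
 * The reduction functor need not be user-defined: any binary functor
 * exposing `T operator()(const T &a, const T &b)` can be supplied. A
 * minimal variation on the snippet above substitutes the predefined
 * `cub::Sum()` operator for `reduction_op` in both calls, producing
 * per-run sums instead of per-run minima:
 * @code
 * // Sum the values belonging to each run of identical keys
 * cub::DeviceReduce::ReduceByKey(
 *     d_temp_storage, temp_storage_bytes,
 *     d_keys_in, d_unique_out, d_values_in,
 *     d_aggregates_out, d_num_runs_out, cub::Sum(), num_items);
 *
 * // With the example input above:
 * // d_unique_out     <-- [0, 2, 9, 5, 8]
 * // d_aggregates_out <-- [0, 8, 6, 10, 4]
 * @endcode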
*/ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t ReduceByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // FlagT iterator type (not used) // Selection op (not used) // Default == operator typedef Equality EqualityOp; return DispatchReduceByKey::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, EqualityOp(), reduction_op, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t ReduceByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items, stream); } }; /** * @example example_device_reduce.cu */ CUB_NAMESPACE_END cub-2.0.1/cub/device/device_run_length_encode.cuh000066400000000000000000000430111434614775400220130ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceRunLengthEncode provides device-wide, parallel operations * for computing a run-length encoding across a sequence of data items * residing within device-accessible memory. 
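 *
 * For example, run-length encoding the sequence [0, 2, 2, 9, 5, 5, 5, 8]
 * yields the unique values [0, 2, 9, 5, 8] with run lengths
 * [1, 2, 1, 3, 1], while the non-trivial-run variant reports only runs
 * longer than one item, i.e., offsets [1, 4] with lengths [2, 3].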
*/ #pragma once #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceRunLengthEncode provides device-wide, parallel operations for * demarcating "runs" of same-valued items within a sequence residing * within device-accessible memory. ![](run_length_encode_logo.png) * @ingroup SingleModule * * @par Overview * A *run-length encoding* * computes a simple compressed representation of a sequence of input elements * such that each maximal "run" of consecutive same-valued data items is * encoded as a single data value along with a count of the elements in that * run. * * @par Usage Considerations * @cdp_class{DeviceRunLengthEncode} * * @par Performance * @linear_performance{run-length encode} * * @par * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode * performance across different CUDA architectures for `int32` items. * Segments have lengths uniformly sampled from `[1, 1000]`. * * @image html rle_int32_len_500.png * * @par * @plots_below */ struct DeviceRunLengthEncode { /** * @brief Computes a run-length encoding of the sequence \p d_in. * * @par * - For the *i*th run encountered, the first key of the run and * its length are written to `d_unique_out[i]` and `d_counts_out[i]`, * respectively. * - The total number of runs encountered is written to `d_num_runs_out`. * - The `==` equality operator is used to determine whether values are * equivalent * - In-place operations are not supported. There must be no overlap between * any of the provided ranges: * - `[d_unique_out, d_unique_out + *d_num_runs_out)` * - `[d_counts_out, d_counts_out + *d_num_runs_out)` * - `[d_num_runs_out, d_num_runs_out + 1)` * - `[d_in, d_in + num_items)` * - @devicestorage * * @par Performance * The following charts illustrate saturated encode performance across * different CUDA architectures for `int32` and `int64` items, respectively. * Segments have lengths uniformly sampled from [1,1000]. * * @image html rle_int32_len_500.png * @image html rle_int64_len_500.png * * @par * The following charts are similar, but with segment lengths uniformly * sampled from [1,10]: * * @image html rle_int32_len_5.png * @image html rle_int64_len_5.png * * @par Snippet * The code snippet below illustrates the run-length encoding of a sequence * of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 8 * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] * int *d_unique_out; // e.g., [ , , , , , , , ] * int *d_counts_out; // e.g., [ , , , , , , , ] * int *d_num_runs_out; // e.g., [ ] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRunLengthEncode::Encode( * d_temp_storage, temp_storage_bytes, * d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run encoding * cub::DeviceRunLengthEncode::Encode( * d_temp_storage, temp_storage_bytes, * d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); * * // d_unique_out <-- [0, 2, 9, 5, 8] * // d_counts_out <-- [1, 2, 1, 3, 1] * // d_num_runs_out <-- [5] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam UniqueOutputIteratorT * **[inferred]** Random-access output iterator type for writing unique * output items \iterator * * @tparam LengthsOutputIteratorT * **[inferred]** Random-access output iterator type for writing output * counts \iterator * * @tparam NumRunsOutputIteratorT * **[inferred]** Output iterator type for recording the number of runs * encountered \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of keys * * @param[out] d_unique_out * Pointer to the output sequence of unique keys (one key per run) * * @param[out] d_counts_out * Pointer to the output sequence of run-lengths (one count per run) * * @param[out] d_num_runs_out * Pointer to total number of runs * * @param[in] num_items * Total number of associated key+value pairs (i.e., the length of * `d_in_keys` and `d_in_values`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
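 *
 * @par
 * A minimal sizing sketch: the number of runs can never exceed
 * `num_items`, so when the run count is not known in advance, the output
 * arrays from the snippet above can be sized for the worst case in which
 * every input item forms its own run:
 * @code
 * // Worst case: num_items runs, one per input item
 * cudaMalloc(&d_unique_out,   num_items * sizeof(int));
 * cudaMalloc(&d_counts_out,   num_items * sizeof(int));
 * cudaMalloc(&d_num_runs_out, sizeof(int));
 * @endcode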
*/ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Encode(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, UniqueOutputIteratorT d_unique_out, LengthsOutputIteratorT d_counts_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, cudaStream_t stream = 0) { using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType *; // FlagT iterator type (not used) using SelectOp = NullType; // Selection op (not used) using EqualityOp = Equality; // Default == operator using ReductionOp = cub::Sum; // Value reduction operator // The lengths output value type using LengthT = cub::detail::non_void_value_t; // Generator type for providing 1s values for run-length reduction using LengthsInputIteratorT = ConstantInputIterator; return DispatchReduceByKey::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, LengthsInputIteratorT( (LengthT)1), d_counts_out, d_num_runs_out, EqualityOp(), ReductionOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Encode(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, UniqueOutputIteratorT d_unique_out, LengthsOutputIteratorT d_counts_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream); } /** * @brief Enumerates the starting offsets and lengths of all non-trivial runs * (of `length > 1`) of same-valued keys in the sequence `d_in`. * * @par * - For the *i*th non-trivial run, the run's starting offset and * its length are written to `d_offsets_out[i]` and `d_lengths_out[i]`, * respectively. * - The total number of runs encountered is written to `d_num_runs_out`. * - The `==` equality operator is used to determine whether values are * equivalent * - In-place operations are not supported. There must be no overlap between * any of the provided ranges: * - `[d_offsets_out, d_offsets_out + *d_num_runs_out)` * - `[d_lengths_out, d_lengths_out + *d_num_runs_out)` * - `[d_num_runs_out, d_num_runs_out + 1)` * - `[d_in, d_in + num_items)` * - @devicestorage * * @par Performance * * @par Snippet * The code snippet below illustrates the identification of non-trivial runs * within a sequence of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] * int *d_offsets_out; // e.g., [ , , , , , , , ] * int *d_lengths_out; // e.g., [ , , , , , , , ] * int *d_num_runs_out; // e.g., [ ] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceRunLengthEncode::NonTrivialRuns( * d_temp_storage, temp_storage_bytes, * d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run encoding * cub::DeviceRunLengthEncode::NonTrivialRuns( * d_temp_storage, temp_storage_bytes, * d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); * * // d_offsets_out <-- [1, 4] * // d_lengths_out <-- [2, 3] * // d_num_runs_out <-- [2] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OffsetsOutputIteratorT * **[inferred]** Random-access output iterator type for writing run-offset * values \iterator * * @tparam LengthsOutputIteratorT * **[inferred]** Random-access output iterator type for writing run-length * values \iterator * * @tparam NumRunsOutputIteratorT * **[inferred]** Output iterator type for recording the number of runs * encountered \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to input sequence of data items * * @param[out] d_offsets_out * Pointer to output sequence of run-offsets * (one offset per non-trivial run) * * @param[out] d_lengths_out * Pointer to output sequence of run-lengths * (one count per non-trivial run) * * @param[out] d_num_runs_out * Pointer to total number of runs (i.e., length of `d_offsets_out`) * * @param[in] num_items * Total number of associated key+value pairs (i.e., the length of * `d_in_keys` and `d_in_values`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t NonTrivialRuns(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, cudaStream_t stream = 0) { using OffsetT = int; // Signed integer type for global offsets using EqualityOp = Equality; // Default == operator return DeviceRleDispatch::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t NonTrivialRuns(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/device_scan.cuh000066400000000000000000002461051434614775400172660ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceScan provides device-wide, parallel operations for * computing a prefix scan across a sequence of data items residing * within device-accessible memory. */ #pragma once #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceScan provides device-wide, parallel operations for computing a * prefix scan across a sequence of data items residing within * device-accessible memory. ![](device_scan.png) * * @ingroup SingleModule * * @par Overview * Given a sequence of input elements and a binary reduction operator, a * [*prefix scan*](http://en.wikipedia.org/wiki/Prefix_sum) produces an output * sequence where each element is computed to be the reduction of the elements * occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan * with the addition operator. The term *inclusive* indicates that the * *i*th output reduction incorporates the *i*th input. * The term *exclusive* indicates the *i*th input is not * incorporated into the *i*th output reduction. When the input and * output sequences are the same, the scan is performed in-place. * * @par * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our * *"decoupled look-back"* algorithm for performing global prefix scan with * only a single pass through the input data, as described in our 2016 technical * report [1]. The central idea is to leverage a small, constant factor of * redundant work in order to overlap the latencies of global prefix * propagation with local computation. As such, our algorithm requires only * ~2*n* data movement (*n* inputs are read, *n* outputs are written), and * typically proceeds at "memcpy" speeds. Our algorithm supports inplace * operations. * * @par * [1] [Duane Merrill and Michael Garland. 
"Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) * * @par Usage Considerations * @cdp_class{DeviceScan} * * @par Performance * @linear_performance{prefix scan} * * @par * The following chart illustrates DeviceScan::ExclusiveSum performance across * different CUDA architectures for `int32` keys. * @plots_below * * @image html scan_int32.png * */ struct DeviceScan { /******************************************************************//** * \name Exclusive scans *********************************************************************/ //@{ /** * @brief Computes a device-wide exclusive prefix sum. The value of `0` is * applied as the initial value, and is assigned to `*d_out`. * * @par * - Supports non-commutative sum operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - When `d_in` and `d_out` are equal, the scan is performed in-place. The * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. * - @devicestorage * * @par Performance * The following charts illustrate saturated exclusive sum performance across * different CUDA architectures for `int32` and `int64` items, respectively. * * @image html scan_int32.png * @image html scan_int64.png * * @par Snippet * The code snippet below illustrates the exclusive prefix sum of an `int` * device vector. * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [ , , , , , , ] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveSum( * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run exclusive prefix sum * cub::DeviceScan::ExclusiveSum( * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items); * * // d_out <-- [0, 8, 14, 21, 26, 29, 29] * * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Random-access iterator to the input sequence of data items * * @param[out] d_out * Random-access iterator to the output sequence of data items * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
* * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template <typename InputIteratorT, typename OutputIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; using InitT = cub::detail::value_t<InputIteratorT>; // Initial value InitT init_value{}; return DispatchScan<InputIteratorT, OutputIteratorT, Sum, detail::InputValue<InitT>, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), detail::InputValue<InitT>(init_value), num_items, stream); } template <typename InputIteratorT, typename OutputIteratorT> CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } /** * @brief Computes a device-wide exclusive prefix sum in-place. The value of * `0` is applied as the initial value, and is assigned to `*d_data`. * * @par * - Supports non-commutative sum operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - @devicestorage * * @par Performance * The following charts illustrate saturated exclusive sum performance across * different CUDA architectures for `int32` and `int64` items, respectively. * * @image html scan_int32.png * @image html scan_int64.png * * @par Snippet * The code snippet below illustrates the exclusive prefix sum of an `int` * device vector. * @par * @code * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh> * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveSum( * d_temp_storage, temp_storage_bytes, * d_data, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run exclusive prefix sum * cub::DeviceScan::ExclusiveSum( * d_temp_storage, temp_storage_bytes, * d_data, num_items); * * // d_data <-- [0, 8, 14, 21, 26, 29, 29] * * @endcode * * @tparam IteratorT * **[inferred]** Random-access iterator type for reading scan * inputs and writing scan outputs * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_data * Random-access iterator to the sequence of data items * * @param[in] num_items * Total number of input items (i.e., the length of `d_data`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0.
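 *
 * @par
 * The in-place overload simply forwards `d_data` as both the scan input and
 * output. A brief sketch using `thrust::device_vector` to own the buffers is
 * shown below; relying on Thrust here is an assumption of this example, not
 * a requirement of the interface.
 * @code
 * #include <cub/device/device_scan.cuh>
 * #include <thrust/device_vector.h>
 * #include <vector>
 *
 * std::vector<int> h_data{8, 6, 7, 5, 3, 0, 9};
 * thrust::device_vector<int> data(h_data);            // copy to the device
 * int *d_data    = thrust::raw_pointer_cast(data.data());
 * int  num_items = static_cast<int>(data.size());
 *
 * // Size, allocate, and run
 * void *d_temp_storage = nullptr;
 * size_t temp_storage_bytes = 0;
 * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
 *                               d_data, num_items);
 * thrust::device_vector<unsigned char> temp(temp_storage_bytes);
 * d_temp_storage = thrust::raw_pointer_cast(temp.data());
 * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
 *                               d_data, num_items);
 *
 * // data now holds [0, 8, 14, 21, 26, 29, 29]
 * @endcode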
* * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) { return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } /** * @brief Computes a device-wide exclusive prefix scan using the specified * binary `scan_op` functor. The `init_value` value is applied as * the initial value, and is assigned to `*d_out`. * * @par * - Supports non-commutative scan operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - When `d_in` and `d_out` are equal, the scan is performed in-place. The * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. * - @devicestorage * * @par Snippet * The code snippet below illustrates the exclusive prefix min-scan of an * `int` device vector * @par * @code * #include // or equivalently * #include // for INT_MAX * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? b : a; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [ , , , , , , ] * CustomMin min_op; * ... * * // Determine temporary device storage requirements for exclusive * // prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveScan( * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, (int) INT_MAX, num_items); * * // Allocate temporary storage for exclusive prefix scan * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run exclusive prefix min-scan * cub::DeviceScan::ExclusiveScan( * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, (int) INT_MAX, num_items); * * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] * * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * * @tparam ScanOp * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * * @tparam InitValueT * **[inferred]** Type of the `init_value` used Binary scan functor type * having member `T operator()(const T &a, const T &b)` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Random-access iterator to the input sequence of data items * * @param[out] d_out * Random-access iterator to the output sequence of data items * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan (and is assigned to *d_out) * * @param[in] num_items * Total number of input items (i.e., the length of \p d_in) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int ; return DispatchScan, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue( init_value), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } /** * @brief Computes a device-wide exclusive prefix scan using the specified * binary `scan_op` functor. The `init_value` value is applied as * the initial value, and is assigned to `*d_data`. * * @par * - Supports non-commutative scan operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - @devicestorage * * @par Snippet * The code snippet below illustrates the exclusive prefix min-scan of an * `int` device vector * @par * @code * #include // or equivalently * #include // for INT_MAX * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? b : a; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] * CustomMin min_op; * ... 
* * // Determine temporary device storage requirements for exclusive * // prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveScan( * d_temp_storage, temp_storage_bytes, * d_data, min_op, (int) INT_MAX, num_items); * * // Allocate temporary storage for exclusive prefix scan * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run exclusive prefix min-scan * cub::DeviceScan::ExclusiveScan( * d_temp_storage, temp_storage_bytes, * d_data, min_op, (int) INT_MAX, num_items); * * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] * * @endcode * * @tparam IteratorT * **[inferred]** Random-access input iterator type for reading scan * inputs and writing scan outputs * * @tparam ScanOp * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * * @tparam InitValueT * **[inferred]** Type of the `init_value` used Binary scan functor type * having member `T operator()(const T &a, const T &b)` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_data * Random-access iterator to the sequence of data items * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan (and is assigned to *d_out) * * @param[in] num_items * Total number of input items (i.e., the length of \p d_in) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, int num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } /** * @brief Computes a device-wide exclusive prefix scan using the specified * binary `scan_op` functor. The `init_value` value is provided as * a future value. * * @par * - Supports non-commutative scan operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - When `d_in` and `d_out` are equal, the scan is performed in-place. The * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. 
* - @devicestorage * * @par Snippet * The code snippet below illustrates the exclusive prefix min-scan of an * `int` device vector * @par * @code * #include // or equivalently * #include // for INT_MAX * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? b : a; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [ , , , , , , ] * int *d_init_iter; // e.g., INT_MAX * CustomMin min_op; * * auto future_init_value = * cub::FutureValue(d_init_iter); * * ... * * // Determine temporary device storage requirements for exclusive * // prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveScan( * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, future_init_value, num_items); * * // Allocate temporary storage for exclusive prefix scan * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run exclusive prefix min-scan * cub::DeviceScan::ExclusiveScan( * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, future_init_value, num_items); * * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] * * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * * @tparam ScanOp * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * * @tparam InitValueT * **[inferred]** Type of the `init_value` used Binary scan functor type * having member `T operator()(const T &a, const T &b)` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output sequence of data items * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan (and is assigned to `*d_out`) * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
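 *
 * @par
 * A typical reason to use `FutureValue` is to let earlier device work on the
 * same stream produce the seed without a round trip to the host. The sketch
 * below feeds the result of `cub::DeviceReduce::Min` into the scan; the
 * variable names and the elided temporary-storage setup are assumptions of
 * this example.
 * @code
 * #include <cub/device/device_reduce.cuh>
 * #include <cub/device/device_scan.cuh>
 *
 * // d_seed points to a single int on the device (allocation elided).
 * // Each CUB call below follows the usual two-phase size/run pattern,
 * // which is also elided here for brevity.
 * cub::DeviceReduce::Min(d_temp_reduce, temp_reduce_bytes,
 *                        d_other, d_seed, num_other, stream);
 *
 * // The scan reads *d_seed only when it executes on `stream`, so no host
 * // synchronization is required between the two calls.
 * cub::DeviceScan::ExclusiveScan(d_temp_scan, temp_scan_bytes,
 *                                d_in, d_out, min_op,
 *                                cub::FutureValue<int>(d_seed),
 *                                num_items, stream);
 * @endcode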
* * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; return DispatchScan, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue( init_value), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } /** * @brief Computes a device-wide exclusive prefix scan using the specified * binary `scan_op` functor. The `init_value` value is provided as * a future value. * * @par * - Supports non-commutative scan operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - @devicestorage * * @par Snippet * The code snippet below illustrates the exclusive prefix min-scan of an * `int` device vector * @par * @code * #include // or equivalently * #include // for INT_MAX * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? b : a; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_init_iter; // e.g., INT_MAX * CustomMin min_op; * * auto future_init_value = * cub::FutureValue(d_init_iter); * * ... * * // Determine temporary device storage requirements for exclusive * // prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveScan( * d_temp_storage, temp_storage_bytes, * d_data, min_op, future_init_value, num_items); * * // Allocate temporary storage for exclusive prefix scan * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run exclusive prefix min-scan * cub::DeviceScan::ExclusiveScan( * d_temp_storage, temp_storage_bytes, * d_data, min_op, future_init_value, num_items); * * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] * * @endcode * * @tparam IteratorT * **[inferred]** Random-access input iterator type for reading scan * inputs and writing scan outputs * * @tparam ScanOp * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * * @tparam InitValueT * **[inferred]** Type of the `init_value` used Binary scan functor type * having member `T operator()(const T &a, const T &b)` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * * @param[in,out] d_data * Pointer to the sequence of data items * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan (and is assigned to `*d_out`) * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, int num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } //@} end member group /******************************************************************//** * @name Inclusive scans *********************************************************************/ //@{ /** * @brief Computes a device-wide inclusive prefix sum. * * @par * - Supports non-commutative sum operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - When `d_in` and `d_out` are equal, the scan is performed in-place. The * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. * - @devicestorage * * @par Snippet * The code snippet below illustrates the inclusive prefix sum of an `int` * device vector. * * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [ , , , , , , ] * ... * * // Determine temporary device storage requirements for inclusive * // prefix sum * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveSum( * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items); * * // Allocate temporary storage for inclusive prefix sum * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run inclusive prefix sum * cub::DeviceScan::InclusiveSum( * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items); * * // d_out <-- [8, 14, 21, 26, 29, 29, 38] * * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Random-access iterator to the input sequence of data items * * @param[out] d_out * Random-access iterator to the output sequence of data items * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; return DispatchScan::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } /** * @brief Computes a device-wide inclusive prefix sum in-place. * * @par * - Supports non-commutative sum operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - @devicestorage * * @par Snippet * The code snippet below illustrates the inclusive prefix sum of an `int` * device vector. * * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Determine temporary device storage requirements for inclusive * // prefix sum * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveSum( * d_temp_storage, temp_storage_bytes, * d_data, num_items); * * // Allocate temporary storage for inclusive prefix sum * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run inclusive prefix sum * cub::DeviceScan::InclusiveSum( * d_temp_storage, temp_storage_bytes, * d_data, num_items); * * // d_data <-- [8, 14, 21, 26, 29, 29, 38] * * @endcode * * @tparam IteratorT * **[inferred]** Random-access input iterator type for reading scan * inputs and writing scan outputs * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_data * Random-access iterator to the sequence of data items * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
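 *
 * @par
 * Every overload accepts an optional `stream` argument, so the scan can be
 * enqueued on a user-created stream. A brief sketch follows; the stream
 * creation/destruction and the reuse of `d_data` / `num_items` from the
 * snippet above are assumptions of this example.
 * @code
 * cudaStream_t stream;
 * cudaStreamCreate(&stream);
 *
 * void *d_temp_storage = nullptr;
 * size_t temp_storage_bytes = 0;
 * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes,
 *                               d_data, num_items, stream);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes,
 *                               d_data, num_items, stream);
 *
 * cudaStreamSynchronize(stream); // wait for the in-place scan to finish
 * cudaFree(d_temp_storage);
 * cudaStreamDestroy(stream);
 * @endcode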
* * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) { return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } /** * @brief Computes a device-wide inclusive prefix scan using the specified * binary `scan_op` functor. * * @par * - Supports non-commutative scan operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - When `d_in` and `d_out` are equal, the scan is performed in-place. The * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. * - @devicestorage * * @par Snippet * The code snippet below illustrates the inclusive prefix min-scan of an * `int` device vector. * * @par * @code * #include // or equivalently * #include // for INT_MAX * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? b : a; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [ , , , , , , ] * CustomMin min_op; * ... * * // Determine temporary device storage requirements for inclusive * // prefix scan * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveScan( * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, num_items); * * // Allocate temporary storage for inclusive prefix scan * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run inclusive prefix min-scan * cub::DeviceScan::InclusiveScan( * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, num_items); * * // d_out <-- [8, 6, 6, 5, 3, 0, 0] * * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * * @tparam ScanOp * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * * @param[in] * d_temp_storage Device-accessible allocation of temporary storage. * When `nullptr`, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Random-access iterator to the input sequence of data items * * @param[out] d_out * Random-access iterator to the output sequence of data items * * @param[in] scan_op * Binary scan functor * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. 
* Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, int num_items, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; return DispatchScan::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream); } /** * @brief Computes a device-wide inclusive prefix scan using the specified * binary `scan_op` functor. * * @par * - Supports non-commutative scan operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - @devicestorage * * @par Snippet * The code snippet below illustrates the inclusive prefix min-scan of an * `int` device vector. * * @par * @code * #include // or equivalently * #include // for INT_MAX * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? b : a; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] * CustomMin min_op; * ... * * // Determine temporary device storage requirements for inclusive * // prefix scan * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveScan( * d_temp_storage, temp_storage_bytes, * d_data, min_op, num_items); * * // Allocate temporary storage for inclusive prefix scan * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run inclusive prefix min-scan * cub::DeviceScan::InclusiveScan( * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, num_items); * * // d_data <-- [8, 6, 6, 5, 3, 0, 0] * * @endcode * * @tparam IteratorT * **[inferred]** Random-access input iterator type for reading scan * inputs and writing scan outputs * * @tparam ScanOp * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * * @param[in] * d_temp_storage Device-accessible allocation of temporary storage. * When `nullptr`, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_data * Random-access iterator to the sequence of data items * * @param[in] scan_op * Binary scan functor * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
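 *
 * @par
 * CUB also provides simple binary functors such as `cub::Max` and `cub::Min`
 * (see `<cub/thread/thread_operators.cuh>`), which can be passed as
 * `scan_op` instead of a user-defined functor. A brief sketch of an in-place
 * running maximum is shown below; the reuse of `d_data` / `num_items` from
 * the snippet above is an assumption of this example.
 * @code
 * // d_data  <-- [8, 6, 7, 5, 3, 0, 9]
 * void *d_temp_storage = nullptr;
 * size_t temp_storage_bytes = 0;
 * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
 *                                d_data, cub::Max(), num_items);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
 *                                d_data, cub::Max(), num_items);
 * // d_data  <-- [8, 8, 8, 8, 8, 8, 9]
 * @endcode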
* * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, int num_items, cudaStream_t stream = 0) { return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, scan_op, num_items, stream); } /** * @brief Computes a device-wide exclusive prefix sum-by-key with key equality * defined by `equality_op`. The value of `0` is applied as the initial * value, and is assigned to the beginning of each segment in * `d_values_out`. * * @par * - Supports non-commutative sum operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - `d_keys_in` may equal `d_values_out` but the range * `[d_keys_in, d_keys_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - `d_values_in` may equal `d_values_out` but the range * `[d_values_in, d_values_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - @devicestorage * * @par Snippet * The code snippet below illustrates the exclusive prefix sum-by-key of an * `int` device vector. * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_values_out; // e.g., [ , , , , , , ] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveSumByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run exclusive prefix sum * cub::DeviceScan::ExclusiveSumByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, num_items); * * // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] * * @endcode * * @tparam KeysInputIteratorT * **[inferred]** Random-access input iterator type for reading scan keys * inputs \iterator * * @tparam ValuesInputIteratorT * **[inferred]** Random-access input iterator type for reading scan * values inputs \iterator * * @tparam ValuesOutputIteratorT * **[inferred]** Random-access output iterator type for writing scan * values outputs \iterator * * @tparam EqualityOpT * **[inferred]** Functor type having member * `T operator()(const T &a, const T &b)` for binary operations that * defines the equality of keys * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Random-access input iterator to the input sequence of key items * * @param[in] d_values_in * Random-access input iterator to the input sequence of value items * * @param[out] d_values_out * Random-access output iterator to the output sequence of value items * * @param[in] num_items * Total number of input items (i.e., the length of `d_keys_in` and * `d_values_in`) * * @param[in] equality_op * Binary functor that defines the equality of keys. * Default is cub::Equality(). * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, int num_items, EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; using InitT = cub::detail::value_t; // Initial value InitT init_value{}; return DispatchScanByKey::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, Sum(), init_value, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, int num_items, EqualityOpT equality_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } /** * @brief Computes a device-wide exclusive prefix scan-by-key using the * specified binary `scan_op` functor. The key equality is defined by * `equality_op`. The `init_value` value is applied as the initial * value, and is assigned to the beginning of each segment in * `d_values_out`. * * @par * - Supports non-commutative scan operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - `d_keys_in` may equal `d_values_out` but the range * `[d_keys_in, d_keys_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - `d_values_in` may equal `d_values_out` but the range * `[d_values_in, d_values_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - @devicestorage * * @par Snippet * The code snippet below illustrates the exclusive prefix min-scan-by-key of * an `int` device vector * @par * @code * #include // or equivalently * #include // for INT_MAX * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? 
b : a; * } * }; * * // CustomEqual functor * struct CustomEqual * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return a == b; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_values_out; // e.g., [ , , , , , , ] * CustomMin min_op; * CustomEqual equality_op; * ... * * // Determine temporary device storage requirements for exclusive * // prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveScanByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, min_op, * (int) INT_MAX, num_items, equality_op); * * // Allocate temporary storage for exclusive prefix scan * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run exclusive prefix min-scan * cub::DeviceScan::ExclusiveScanByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, min_op, * (int) INT_MAX, num_items, equality_op); * * // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] * * @endcode * * @tparam KeysInputIteratorT * **[inferred]** Random-access input iterator type for reading scan keys * inputs \iterator * * @tparam ValuesInputIteratorT * **[inferred]** Random-access input iterator type for reading scan values * inputs \iterator * * @tparam ValuesOutputIteratorT * **[inferred]** Random-access output iterator type for writing scan values * outputs \iterator * * @tparam ScanOp * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * * @tparam InitValueT * **[inferred]** Type of the `init_value` value used in Binary scan * functor type having member `T operator()(const T &a, const T &b)` * * @tparam EqualityOpT * **[inferred]** Functor type having member * `T operator()(const T &a, const T &b)` for binary operations that * defines the equality of keys * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Random-access input iterator to the input sequence of key items * * @param[in] d_values_in * Random-access input iterator to the input sequence of value items * * @param[out] d_values_out * Random-access output iterator to the output sequence of value items * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan (and is assigned to the * beginning of each segment in `d_values_out`) * * @param[in] num_items * Total number of input items (i.e., the length of `d_keys_in` and * `d_values_in`) * * @param[in] equality_op * Binary functor that defines the equality of keys. * Default is cub::Equality(). * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
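 *
 * @par
 * Because `equality_op` defaults to `cub::Equality()`, it can be omitted
 * whenever plain `==` comparison of keys is sufficient. The call below is a
 * brief sketch that behaves the same as the snippet above (since
 * `CustomEqual` also compares with `==`); reusing those pointers is an
 * assumption of this example.
 * @code
 * cub::DeviceScan::ExclusiveScanByKey(
 *   d_temp_storage, temp_storage_bytes,
 *   d_keys_in, d_values_in, d_values_out, min_op,
 *   (int) INT_MAX, num_items);
 * @endcode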
* * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, InitValueT init_value, int num_items, EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int ; return DispatchScanByKey::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, scan_op, init_value, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, InitValueT init_value, int num_items, EqualityOpT equality_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, init_value, num_items, equality_op, stream); } /** * @brief Computes a device-wide inclusive prefix sum-by-key with key * equality defined by `equality_op`. * * @par * - Supports non-commutative sum operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - `d_keys_in` may equal `d_values_out` but the range * `[d_keys_in, d_keys_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - `d_values_in` may equal `d_values_out` but the range * `[d_values_in, d_values_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - @devicestorage * * @par Snippet * The code snippet below illustrates the inclusive prefix sum-by-key of an * `int` device vector. * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_values_out; // e.g., [ , , , , , , ] * ... 
* * // Determine temporary device storage requirements for inclusive prefix sum * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveSumByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, num_items); * * // Allocate temporary storage for inclusive prefix sum * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run inclusive prefix sum * cub::DeviceScan::InclusiveSumByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, num_items); * * // d_out <-- [8, 14, 7, 12, 15, 0, 9] * * @endcode * * @tparam KeysInputIteratorT * **[inferred]** Random-access input iterator type for reading scan * keys inputs \iterator * * @tparam ValuesInputIteratorT * **[inferred]** Random-access input iterator type for reading scan * values inputs \iterator * * @tparam ValuesOutputIteratorT * **[inferred]** Random-access output iterator type for writing scan * values outputs \iterator * * @tparam EqualityOpT * **[inferred]** Functor type having member * `T operator()(const T &a, const T &b)` for binary operations that * defines the equality of keys * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When `nullptr`, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Random-access input iterator to the input sequence of key items * * @param[in] d_values_in * Random-access input iterator to the input sequence of value items * * @param[out] d_values_out * Random-access output iterator to the output sequence of value items * * @param[in] num_items * Total number of input items (i.e., the length of `d_keys_in` and * `d_values_in`) * * @param[in] equality_op * Binary functor that defines the equality of keys. * Default is cub::Equality(). * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, int num_items, EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int ; return DispatchScanByKey::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, Sum(), NullType(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, int num_items, EqualityOpT equality_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } /** * @brief Computes a device-wide inclusive prefix scan-by-key using the * specified binary `scan_op` functor. The key equality is defined * by `equality_op`. * * @par * - Supports non-commutative scan operators. * - Results are not deterministic for pseudo-associative operators (e.g., * addition of floating-point types). 
Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. * - `d_keys_in` may equal `d_values_out` but the range * `[d_keys_in, d_keys_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - `d_values_in` may equal `d_values_out` but the range * `[d_values_in, d_values_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - @devicestorage * * @par Snippet * The code snippet below illustrates the inclusive prefix min-scan-by-key * of an `int` device vector. * @par * @code * #include // or equivalently * #include // for INT_MAX * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? b : a; * } * }; * * // CustomEqual functor * struct CustomEqual * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return a == b; * } * }; * * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_values_out; // e.g., [ , , , , , , ] * CustomMin min_op; * CustomEqual equality_op; * ... * * // Determine temporary device storage requirements for inclusive prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveScanByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); * * // Allocate temporary storage for inclusive prefix scan * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run inclusive prefix min-scan * cub::DeviceScan::InclusiveScanByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); * * // d_out <-- [8, 6, 7, 5, 3, 0, 0] * * @endcode * * @tparam KeysInputIteratorT * **[inferred]** Random-access input iterator type for reading scan keys * inputs \iterator * * @tparam ValuesInputIteratorT * **[inferred]** Random-access input iterator type for reading scan * values inputs \iterator * * @tparam ValuesOutputIteratorT * **[inferred]** Random-access output iterator type for writing scan * values outputs \iterator * * @tparam ScanOp * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * * @tparam EqualityOpT * **[inferred]** Functor type having member * `T operator()(const T &a, const T &b)` for binary operations that * defines the equality of keys * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When `nullptr`, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Random-access input iterator to the input sequence of key items * * @param[in] d_values_in * Random-access input iterator to the input sequence of value items * * @param[out] d_values_out * Random-access output iterator to the output sequence of value items * * @param[in] scan_op * Binary scan functor * * @param[in] num_items * Total number of input items (i.e., the length of `d_keys_in` and * `d_values_in`) * * @param[in] equality_op * Binary functor that defines the equality of keys. * Default is cub::Equality(). 
* * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, int num_items, EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; return DispatchScanByKey::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, scan_op, NullType(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, int num_items, EqualityOpT equality_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op, stream); } //@} end member group }; /** * @example example_device_scan.cu */ CUB_NAMESPACE_END cub-2.0.1/cub/device/device_segmented_radix_sort.cuh000066400000000000000000002171721434614775400225550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceSegmentedRadixSort provides device-wide, parallel * operations for computing a batched radix sort across multiple, * non-overlapping sequences of data items residing within * device-accessible memory. 
*/ #pragma once #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceSegmentedRadixSort provides device-wide, parallel operations * for computing a batched radix sort across multiple, non-overlapping * sequences of data items residing within device-accessible memory. * ![](segmented_sorting_logo.png) * @ingroup SegmentedModule * * @par Overview * The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) * arranges items into ascending (or descending) order. The algorithm relies * upon a positional representation for keys, i.e., each key is comprised of an * ordered sequence of symbols (e.g., digits, characters, etc.) specified from * least-significant to most-significant. For a given input sequence of keys * and a set of rules specifying a total ordering of the symbolic alphabet, the * radix sorting method produces a lexicographic ordering of those keys. * * @par See Also * DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See * that algorithm's documentation for more information. * * @par Segments are not required to be contiguous. Any element of input(s) or * output(s) outside the specified segments will not be accessed nor modified. * * @par Usage Considerations * @cdp_class{DeviceSegmentedRadixSort} * */ struct DeviceSegmentedRadixSort { /******************************************************************//** * @name Key-value pairs *********************************************************************/ //@{ /** * @brief Sorts segments of key-value pairs into ascending order. * (`~2N` auxiliary storage required) * * @par * - The contents of the input data are not altered by the sorting operation * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - @devicestorageNP For sorting using only `O(P)` temporary storage, see * the sorting interface using DoubleBuffer wrappers below. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * - @devicestorage * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of `int` keys with associated vector of * `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_values_out; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedRadixSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] * @endcode * * @tparam KeyT * **[inferred]** Key type * * @tparam ValueT * **[inferred]** Value type * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] d_values_in * Device-accessible pointer to the corresponding input sequence of * associated value items * * @param[out] d_values_out * Device-accessible pointer to the correspondingly-reordered output * sequence of associated value items * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. If * `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
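 *
 * @par
 * A sketch of launching the same sort on a user-created CUDA stream; the
 * handle `my_stream` is a hypothetical name, and the default `begin_bit`
 * and `end_bit` values are spelled out only so that the trailing stream
 * argument can be supplied:
 * @code
 * cudaStream_t my_stream;
 * cudaStreamCreate(&my_stream);
 *
 * cub::DeviceSegmentedRadixSort::SortPairs(
 *     d_temp_storage, temp_storage_bytes,
 *     d_keys_in, d_keys_out, d_values_in, d_values_out,
 *     num_items, num_segments, d_offsets, d_offsets + 1,
 *     0, sizeof(int) * 8, my_stream);
 *
 * cudaStreamSynchronize(my_stream);
 * cudaStreamDestroy(my_stream);
 * @endcode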
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchSegmentedRadixSort::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, false, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } /** * @brief Sorts segments of key-value pairs into ascending order. * (`~N` auxiliary storage required) * * @par * - The sorting operation is given a pair of key buffers and a corresponding * pair of associated value buffers. Each pair is managed by a DoubleBuffer * structure that indicates which of the two buffers is "current" (and thus * contains the input data to be sorted). * - The contents of both buffers within each pair may be altered by the * sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within each DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the number * of key bits specified and the targeted device architecture). * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased for both * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is * specified as `segment_offsets + 1`). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and yield * a corresponding performance improvement. * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, * `d_values.Alternate()[i]` will not be accessed nor modified. * - @devicestorageP * - @devicestorage * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of `int` keys with associated vector of * `int` values. 
* @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... * * // Create a set of DoubleBuffers to wrap pairs of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedRadixSort::SortPairs( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedRadixSort::SortPairs( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] * * @endcode * * @tparam KeyT * **[inferred]** Key type * * @tparam ValueT * **[inferred]** Value type * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in,out] d_values * Double-buffer of values whose "current" device-accessible buffer * contains the unsorted input values and, upon return, is updated to point * to the sorted output values * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
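 *
 * @par
 * Because the sorted sequence may reside in either buffer of each pair,
 * results should be read through `Current()`. A minimal sketch (the host
 * arrays `h_sorted_keys` and `h_sorted_values` are hypothetical):
 * @code
 * cudaMemcpy(h_sorted_keys, d_keys.Current(),
 *            num_items * sizeof(int), cudaMemcpyDeviceToHost);
 * cudaMemcpy(h_sorted_values, d_values.Current(),
 *            num_items * sizeof(int), cudaMemcpyDeviceToHost);
 * @endcode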
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; return DispatchSegmentedRadixSort::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, true, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } /** * @brief Sorts segments of key-value pairs into descending order. * (`~2N` auxiliary storage required). * * @par * - The contents of the input data are not altered by the sorting operation * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased for both * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is * specified as `segment_offsets + 1`). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - @devicestorageNP For sorting using only `O(P)` temporary storage, see * the sorting interface using DoubleBuffer wrappers below. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * - @devicestorage * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of `int` keys with associated vector of * `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_values_out; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedRadixSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedRadixSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] * @endcode * * @tparam KeyT * **[inferred]** Key type * * @tparam ValueT * **[inferred]** Value type * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] d_values_in * Device-accessible pointer to the corresponding input sequence of * associated value items * * @param[out] d_values_out * Device-accessible pointer to the correspondingly-reordered output * sequence of associated value items * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th * is considered empty. * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
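 *
 * @par
 * A sketch of restricting the sort to a bit subrange, assuming it is known
 * that all keys fit within the low 16 bits (keys differing only in higher
 * bits would not be ordered correctly under this restriction):
 * @code
 * cub::DeviceSegmentedRadixSort::SortPairsDescending(
 *     d_temp_storage, temp_storage_bytes,
 *     d_keys_in, d_keys_out, d_values_in, d_values_out,
 *     num_items, num_segments, d_offsets, d_offsets + 1,
 *     0, 16);
 * @endcode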
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchSegmentedRadixSort::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, false, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } /** * @brief Sorts segments of key-value pairs into descending order. * (`~N` auxiliary storage required). * * @par * - The sorting operation is given a pair of key buffers and a corresponding * pair of associated value buffers. Each pair is managed by a DoubleBuffer * structure that indicates which of the two buffers is "current" (and thus * contains the input data to be sorted). * - The contents of both buffers within each pair may be altered by the * sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within each DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the number * of key bits specified and the targeted device architecture). * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased for both * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is * specified as `segment_offsets + 1`). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, * `d_values.Alternate()[i]` will not be accessed nor modified. * not to be modified. * - @devicestorageP * - @devicestorage * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of `int` keys with associated vector of * `int` values. 
* @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... * * // Create a set of DoubleBuffers to wrap pairs of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedRadixSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedRadixSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] * @endcode * * @tparam KeyT * **[inferred]** Key type * * @tparam ValueT * **[inferred]** Value type * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in,out] d_values * Double-buffer of values whose "current" device-accessible buffer * contains the unsorted input values and, upon return, is updated to point * to the sorted output values * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th * is considered empty. * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
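 *
 * @par
 * Since both buffers of each pair may be altered, callers that still need
 * the unsorted input afterwards should retain a separate copy beforehand.
 * A minimal sketch (the destination pointers `d_keys_backup` and
 * `d_values_backup` are hypothetical, pre-allocated device buffers):
 * @code
 * cudaMemcpy(d_keys_backup, d_key_buf,
 *            num_items * sizeof(int), cudaMemcpyDeviceToDevice);
 * cudaMemcpy(d_values_backup, d_value_buf,
 *            num_items * sizeof(int), cudaMemcpyDeviceToDevice);
 * @endcode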
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; return DispatchSegmentedRadixSort::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, true, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //@} end member group /******************************************************************//** * @name Keys-only *********************************************************************/ //@{ /** * @brief Sorts segments of keys into ascending order. * (`~2N` auxiliary storage required) * * @par * - The contents of the input data are not altered by the sorting operation * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased for both * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter * is specified as `segment_offsets + 1`). * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - @devicestorageNP For sorting using only `O(P)` temporary storage, see * the sorting interface using DoubleBuffer wrappers below. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not * be accessed nor modified. * - @devicestorage * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] * * @endcode * * @tparam KeyT * **[inferred]** Key type * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
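 *
 * @par
 * The key type is inferred from the pointer arguments, so the same
 * two-phase pattern applies to other arithmetic key types. A sketch with
 * `float` keys (`d_float_keys_in` and `d_float_keys_out` are hypothetical,
 * pre-allocated device buffers):
 * @code
 * void   *d_temp_storage_f     = NULL;
 * size_t  temp_storage_bytes_f = 0;
 * cub::DeviceSegmentedRadixSort::SortKeys(
 *     d_temp_storage_f, temp_storage_bytes_f,
 *     d_float_keys_in, d_float_keys_out,
 *     num_items, num_segments, d_offsets, d_offsets + 1);
 *
 * cudaMalloc(&d_temp_storage_f, temp_storage_bytes_f);
 *
 * cub::DeviceSegmentedRadixSort::SortKeys(
 *     d_temp_storage_f, temp_storage_bytes_f,
 *     d_float_keys_in, d_float_keys_out,
 *     num_items, num_segments, d_offsets, d_offsets + 1);
 * @endcode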
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // Null value type DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchSegmentedRadixSort::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, false, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } /** * @brief Sorts segments of keys into ascending order. (~N auxiliary storage required). * * @par * - The sorting operation is given a pair of key buffers managed by a * DoubleBuffer structure that indicates which of the two buffers is * "current" (and thus contains the input data to be sorted). * - The contents of both buffers may be altered by the sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within the DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the * number of key bits specified and the targeted device architecture). * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased for both * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter * is specified as `segment_offsets + 1`). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * - @devicestorageP * - @devicestorage * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Create a DoubleBuffer to wrap the pair of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] * * @endcode * * @tparam KeyT * **[inferred]** Key type * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1` <= d_begin_offsets[i]`, the *i*th * is considered empty. * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) * needed for key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
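 *
 * @par
 * If the application requires the sorted keys to end up in the original
 * `d_key_buf` allocation, a copy can be made when the "current" buffer is
 * the alternate one. A minimal sketch:
 * @code
 * if (d_keys.Current() != d_key_buf)
 * {
 *     cudaMemcpy(d_key_buf, d_keys.Current(),
 *                num_items * sizeof(int), cudaMemcpyDeviceToDevice);
 * }
 * @endcode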
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // Null value type DoubleBuffer d_values; return DispatchSegmentedRadixSort::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, true, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } /** * @brief Sorts segments of keys into descending order. * (`~2N` auxiliary storage required). * * @par * - The contents of the input data are not altered by the sorting operation * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased for both * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter * is specified as `segment_offsets + 1`). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - @devicestorageNP For sorting using only `O(P)` temporary storage, see * the sorting interface using DoubleBuffer wrappers below. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not * be accessed nor modified. * - @devicestorage * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Create a DoubleBuffer to wrap the pair of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedRadixSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedRadixSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] * * @endcode * * @tparam KeyT * **[inferred]** Key type * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., sizeof(unsigned int) * 8) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
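 *
 * @par
 * Segments may also be described by two independent offset sequences,
 * which permits gaps between segments. An illustrative sketch with
 * hypothetical offset arrays (element 2 of the example input falls outside
 * every segment and is therefore left untouched):
 * @code
 * int *d_begins;   // e.g., [0, 3, 5]
 * int *d_ends;     // e.g., [2, 5, 7]
 *
 * cub::DeviceSegmentedRadixSort::SortKeysDescending(
 *     d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
 *     num_items, num_segments, d_begins, d_ends);
 *
 * // d_keys_out <-- [8, 6, -, 5, 3, 9, 0]
 * @endcode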
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchSegmentedRadixSort::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, false, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } /** * @brief Sorts segments of keys into descending order. * (`~N` auxiliary storage required). * * @par * - The sorting operation is given a pair of key buffers managed by a * DoubleBuffer structure that indicates which of the two buffers is * "current" (and thus contains the input data to be sorted). * - The contents of both buffers may be altered by the sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within the DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the * number of key bits specified and the targeted device architecture). * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * - @devicestorageP * - @devicestorage * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Create a DoubleBuffer to wrap the pair of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedRadixSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedRadixSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] * @endcode * * @tparam KeyT * **[inferred]** Key type * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i], the *i*th is * considered empty. * * @param[in] begin_bit * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * * @param[in] end_bit * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
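 *
 * @par
 * Because the DoubleBuffer's "current" selector may refer to either user
 * buffer after the call, results should always be read through
 * `d_keys.Current()`. The self-contained sketch below illustrates this;
 * the host-side names and the omission of error checking are illustrative
 * assumptions, not part of the interface.
 * @code
 * #include <cub/cub.cuh>
 * #include <cuda_runtime.h>
 *
 * int main()
 * {
 *   const int num_items    = 7;
 *   const int num_segments = 3;
 *   const int h_keys[]     = {8, 6, 7, 5, 3, 0, 9};
 *   const int h_offsets[]  = {0, 3, 3, 7};
 *
 *   int *d_key_buf{}, *d_key_alt_buf{}, *d_offsets{};
 *   cudaMalloc(&d_key_buf,     num_items * sizeof(int));
 *   cudaMalloc(&d_key_alt_buf, num_items * sizeof(int));
 *   cudaMalloc(&d_offsets,     (num_segments + 1) * sizeof(int));
 *   cudaMemcpy(d_key_buf, h_keys, num_items * sizeof(int),
 *              cudaMemcpyHostToDevice);
 *   cudaMemcpy(d_offsets, h_offsets, (num_segments + 1) * sizeof(int),
 *              cudaMemcpyHostToDevice);
 *
 *   cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
 *
 *   void  *d_temp_storage     = nullptr;
 *   size_t temp_storage_bytes = 0;
 *   cub::DeviceSegmentedRadixSort::SortKeysDescending(
 *     d_temp_storage, temp_storage_bytes, d_keys,
 *     num_items, num_segments, d_offsets, d_offsets + 1);
 *   cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *   cub::DeviceSegmentedRadixSort::SortKeysDescending(
 *     d_temp_storage, temp_storage_bytes, d_keys,
 *     num_items, num_segments, d_offsets, d_offsets + 1);
 *
 *   // The sorted keys live in whichever buffer the wrapper now marks as
 *   // "current"; do not assume it is d_key_buf.
 *   int h_sorted[num_items];
 *   cudaMemcpy(h_sorted, d_keys.Current(), num_items * sizeof(int),
 *              cudaMemcpyDeviceToHost);
 *   // h_sorted <-- [8, 7, 6, 9, 5, 3, 0]
 *
 *   cudaFree(d_temp_storage);
 *   cudaFree(d_key_buf); cudaFree(d_key_alt_buf); cudaFree(d_offsets);
 *   return 0;
 * }
 * @endcode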
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // Null value type DoubleBuffer d_values; return DispatchSegmentedRadixSort::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, true, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //@} end member group }; CUB_NAMESPACE_END cub-2.0.1/cub/device/device_segmented_reduce.cuh000066400000000000000000001326431434614775400216450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceSegmentedReduce provides device-wide, parallel operations * for computing a batched reduction across multiple sequences of data * items residing within device-accessible memory. 
*/ #pragma once #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceSegmentedReduce provides device-wide, parallel operations for * computing a reduction across multiple sequences of data items * residing within device-accessible memory. ![](reduce_logo.png) * @ingroup SegmentedModule * * @par Overview * A *reduction* * (or *fold*) uses a binary combining operator to compute a single aggregate * from a sequence of input elements. * * @par Usage Considerations * @cdp_class{DeviceSegmentedReduce} * */ struct DeviceSegmentedReduce { /** * @brief Computes a device-wide segmented reduction using the specified * binary `reduction_op` functor. * * @par * - Does not support binary reduction operators that are non-commutative. * - Provides "run-to-run" determinism for pseudo-associative reduction * (e.g., addition of floating point types) on the same GPU device. * However, results for pseudo-associative reduction may be inconsistent * from one device to a another device of a different compute-capability * because CUB can employ different tile-sizing for different architectures. * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). * - Let `s` be in `[0, num_segments)`. The range * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet * The code snippet below illustrates a custom min-reduction of a device * vector of `int` data elements. * @par * @code * #include * // or equivalently * * // CustomMin functor * struct CustomMin * { * template * CUB_RUNTIME_FUNCTION __forceinline__ * T operator()(const T &a, const T &b) const { * return (b < a) ? b : a; * } * }; * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [-, -, -] * CustomMin min_op; * int initial_value; // e.g., INT_MAX * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedReduce::Reduce( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run reduction * cub::DeviceSegmentedReduce::Reduce( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); * * // d_out <-- [6, INT_MAX, 0] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @tparam ReductionOp * **[inferred]** Binary reduction functor type having member * `T operator()(const T &a, const T &b)` * * @tparam T * **[inferred]** Data element type that is convertible to the `value` type * of `InputIteratorT` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] reduction_op * Binary reduction functor * * @param[in] initial_value * Initial value of the reduction for each segment * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
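 *
 * @par
 * Any commutative, device-callable functor can be supplied. The sketch
 * below uses a hypothetical `CustomProduct` functor with an initial value
 * of `1`; note that the empty segment simply yields the initial value.
 * The host-side setup and the omission of error checking are illustrative
 * assumptions, not part of the interface.
 * @code
 * #include <cub/cub.cuh>
 * #include <cuda_runtime.h>
 *
 * // Commutative product functor (illustrative)
 * struct CustomProduct
 * {
 *   template <typename T>
 *   __host__ __device__ __forceinline__
 *   T operator()(const T &a, const T &b) const { return a * b; }
 * };
 *
 * int main()
 * {
 *   const int num_segments = 3;
 *   const int h_in[]       = {2, 3, 4, 1, 5, 1, 2};
 *   const int h_offsets[]  = {0, 3, 3, 7};
 *
 *   int *d_in{}, *d_out{}, *d_offsets{};
 *   cudaMalloc(&d_in,      7 * sizeof(int));
 *   cudaMalloc(&d_out,     num_segments * sizeof(int));
 *   cudaMalloc(&d_offsets, (num_segments + 1) * sizeof(int));
 *   cudaMemcpy(d_in, h_in, 7 * sizeof(int), cudaMemcpyHostToDevice);
 *   cudaMemcpy(d_offsets, h_offsets, (num_segments + 1) * sizeof(int),
 *              cudaMemcpyHostToDevice);
 *
 *   void  *d_temp_storage     = nullptr;
 *   size_t temp_storage_bytes = 0;
 *   cub::DeviceSegmentedReduce::Reduce(
 *     d_temp_storage, temp_storage_bytes, d_in, d_out,
 *     num_segments, d_offsets, d_offsets + 1, CustomProduct{}, 1);
 *   cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *   cub::DeviceSegmentedReduce::Reduce(
 *     d_temp_storage, temp_storage_bytes, d_in, d_out,
 *     num_segments, d_offsets, d_offsets + 1, CustomProduct{}, 1);
 *
 *   int h_out[num_segments];
 *   cudaMemcpy(h_out, d_out, num_segments * sizeof(int),
 *              cudaMemcpyDeviceToHost);
 *   // h_out <-- [24, 1, 10]  (the empty segment yields the initial value)
 *
 *   cudaFree(d_temp_storage);
 *   cudaFree(d_in); cudaFree(d_out); cudaFree(d_offsets);
 *   return 0;
 * }
 * @endcode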
*/ template CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; return DispatchSegmentedReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream); } /** * @brief Computes a device-wide segmented sum using the addition (`+`) * operator. * * @par * - Uses `0` as the initial value of the reduction for each segment. * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). * - Does not support `+` operators that are non-commutative. * - Let `s` be in `[0, num_segments)`. The range * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet * The code snippet below illustrates the sum reduction of a device vector of * `int` data elements. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [-, -, -] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedReduce::Sum( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sum-reduction * cub::DeviceSegmentedReduce::Sum( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // d_out <-- [21, 0, 17] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced aggregate * \iterator * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] stream * **[optional] CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The output value type using OutputT = cub::detail::non_void_value_t>; return DispatchSegmentedReduce< InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, cub::Sum>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::Sum(), OutputT(), // zero-initialize stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Computes a device-wide segmented minimum using the less-than * (`<`) operator. * * @par * - Uses `std::numeric_limits::max()` as the initial value of the * reduction for each segment. * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased for both * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is * specified as `segment_offsets + 1`). * - Does not support `<` operators that are non-commutative. * - Let `s` be in `[0, num_segments)`. The range * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet * The code snippet below illustrates the min-reduction of a device vector of * `int` data elements. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [-, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedReduce::Min( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run min-reduction * cub::DeviceSegmentedReduce::Min( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // d_out <-- [6, INT_MAX, 0] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
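 *
 * @par
 * The sketch below issues the same reduction on a user-created CUDA
 * stream: the stream is passed as the trailing argument and the host
 * synchronizes on it before reading results. The host-side names and the
 * omission of error checking are illustrative assumptions.
 * @code
 * #include <cub/cub.cuh>
 * #include <cuda_runtime.h>
 *
 * int main()
 * {
 *   const int num_segments = 3;
 *   const int h_in[]       = {8, 6, 7, 5, 3, 0, 9};
 *   const int h_offsets[]  = {0, 3, 3, 7};
 *
 *   cudaStream_t stream;
 *   cudaStreamCreate(&stream);
 *
 *   int *d_in{}, *d_out{}, *d_offsets{};
 *   cudaMalloc(&d_in,      7 * sizeof(int));
 *   cudaMalloc(&d_out,     num_segments * sizeof(int));
 *   cudaMalloc(&d_offsets, (num_segments + 1) * sizeof(int));
 *   cudaMemcpyAsync(d_in, h_in, 7 * sizeof(int),
 *                   cudaMemcpyHostToDevice, stream);
 *   cudaMemcpyAsync(d_offsets, h_offsets, (num_segments + 1) * sizeof(int),
 *                   cudaMemcpyHostToDevice, stream);
 *
 *   void  *d_temp_storage     = nullptr;
 *   size_t temp_storage_bytes = 0;
 *   cub::DeviceSegmentedReduce::Min(
 *     d_temp_storage, temp_storage_bytes, d_in, d_out,
 *     num_segments, d_offsets, d_offsets + 1, stream);
 *   cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *   cub::DeviceSegmentedReduce::Min(
 *     d_temp_storage, temp_storage_bytes, d_in, d_out,
 *     num_segments, d_offsets, d_offsets + 1, stream);
 *
 *   int h_out[num_segments];
 *   cudaMemcpyAsync(h_out, d_out, num_segments * sizeof(int),
 *                   cudaMemcpyDeviceToHost, stream);
 *   cudaStreamSynchronize(stream);
 *   // h_out <-- [6, INT_MAX, 0]  (the empty segment yields the maximum)
 *
 *   cudaFree(d_temp_storage);
 *   cudaFree(d_in); cudaFree(d_out); cudaFree(d_offsets);
 *   cudaStreamDestroy(stream);
 *   return 0;
 * }
 * @endcode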
*/ template CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The input value type using InputT = cub::detail::value_t; return DispatchSegmentedReduce< InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, cub::Min>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::Min(), Traits::Max(), // replace with // std::numeric_limits::max() // when C++11 support is more // prevalent stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Finds the first device-wide minimum in each segment using the * less-than ('<') operator, also returning the in-segment index of * that item. * * @par * - The output value type of `d_out` is cub::KeyValuePair `` * (assuming the value type of `d_in` is `T`) * - The minimum of the *i*th segment is written to * `d_out[i].value` and its offset in that segment is written to * `d_out[i].key`. * - The `{1, std::numeric_limits::max()}` tuple is produced for * zero-length inputs * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased for both * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter * is specified as `segment_offsets + 1`). * - Does not support `<` operators that are non-commutative. * - Let `s` be in `[0, num_segments)`. The range * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet * The code snippet below illustrates the argmin-reduction of a device vector * of `int` data elements. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedReduce::ArgMin( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run argmin-reduction * cub::DeviceSegmentedReduce::ArgMin( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input items * (of some type `T`) \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced aggregate * (having value type `KeyValuePair`) \iterator * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the * *i*th is considered empty. * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
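 *
 * @par
 * The `key` member of each result is an offset *within* its segment; a
 * global index into the input can be recovered by adding the segment's
 * begin offset, as the sketch below does on the host. The host-side names
 * and the omission of error checking are illustrative assumptions.
 * @code
 * #include <cub/cub.cuh>
 * #include <cuda_runtime.h>
 * #include <cstdio>
 *
 * int main()
 * {
 *   const int num_segments = 3;
 *   const int h_in[]       = {8, 6, 7, 5, 3, 0, 9};
 *   const int h_offsets[]  = {0, 3, 3, 7};
 *
 *   int *d_in{}, *d_offsets{};
 *   cub::KeyValuePair<int, int> *d_out{};
 *   cudaMalloc(&d_in,      7 * sizeof(int));
 *   cudaMalloc(&d_offsets, (num_segments + 1) * sizeof(int));
 *   cudaMalloc(&d_out,     num_segments * sizeof(cub::KeyValuePair<int, int>));
 *   cudaMemcpy(d_in, h_in, 7 * sizeof(int), cudaMemcpyHostToDevice);
 *   cudaMemcpy(d_offsets, h_offsets, (num_segments + 1) * sizeof(int),
 *              cudaMemcpyHostToDevice);
 *
 *   void  *d_temp_storage     = nullptr;
 *   size_t temp_storage_bytes = 0;
 *   cub::DeviceSegmentedReduce::ArgMin(
 *     d_temp_storage, temp_storage_bytes, d_in, d_out,
 *     num_segments, d_offsets, d_offsets + 1);
 *   cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *   cub::DeviceSegmentedReduce::ArgMin(
 *     d_temp_storage, temp_storage_bytes, d_in, d_out,
 *     num_segments, d_offsets, d_offsets + 1);
 *
 *   cub::KeyValuePair<int, int> h_out[num_segments];
 *   cudaMemcpy(h_out, d_out, num_segments * sizeof(h_out[0]),
 *              cudaMemcpyDeviceToHost);
 *
 *   for (int s = 0; s < num_segments; ++s)
 *   {
 *     // .key is the offset within segment s; add the segment's begin
 *     // offset to obtain a global index into d_in.
 *     std::printf("segment %d: min=%d at global index %d\n",
 *                 s, h_out[s].value, h_offsets[s] + h_out[s].key);
 *   }
 *   // Segment 0: min 6 at index 1; segment 2: min 0 at index 5. For the
 *   // empty segment 1, the value is INT_MAX and the index is meaningless.
 *
 *   cudaFree(d_temp_storage);
 *   cudaFree(d_in); cudaFree(d_out); cudaFree(d_offsets);
 *   return 0;
 * }
 * @endcode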
*/ template CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; // The output value type using OutputValueT = typename OutputTupleT::Value; // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value OutputTupleT initial_value(1, Traits::Max()); // replace with // std::numeric_limits::max() // when C++11 // support is // more prevalent return DispatchSegmentedReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::ArgMin(), initial_value, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Computes a device-wide segmented maximum using the greater-than * (`>`) operator. * * @par * - Uses `std::numeric_limits::lowest()` as the initial value of the * reduction. * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). * - Does not support `>` operators that are non-commutative. * - Let `s` be in `[0, num_segments)`. The range * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet * The code snippet below illustrates the max-reduction of a device vector * of `int` data elements. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [-, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedReduce::Max( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run max-reduction * cub::DeviceSegmentedReduce::Max( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // d_out <-- [8, INT_MIN, 9] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
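 *
 * @par
 * Temporary storage is an opaque scratch allocation, so one buffer sized
 * for the largest of several queried requirements can be reused across
 * consecutive (non-concurrent) calls. The sketch below applies this to a
 * Max and a Sum reduction over the same segmented input; this reuse
 * pattern and the host-side setup are illustrative assumptions rather
 * than a documented guarantee.
 * @code
 * #include <cub/cub.cuh>
 * #include <cuda_runtime.h>
 * #include <algorithm>
 *
 * int main()
 * {
 *   const int num_segments = 3;
 *   const int h_in[]       = {8, 6, 7, 5, 3, 0, 9};
 *   const int h_offsets[]  = {0, 3, 3, 7};
 *
 *   int *d_in{}, *d_max{}, *d_sum{}, *d_offsets{};
 *   cudaMalloc(&d_in,      7 * sizeof(int));
 *   cudaMalloc(&d_max,     num_segments * sizeof(int));
 *   cudaMalloc(&d_sum,     num_segments * sizeof(int));
 *   cudaMalloc(&d_offsets, (num_segments + 1) * sizeof(int));
 *   cudaMemcpy(d_in, h_in, 7 * sizeof(int), cudaMemcpyHostToDevice);
 *   cudaMemcpy(d_offsets, h_offsets, (num_segments + 1) * sizeof(int),
 *              cudaMemcpyHostToDevice);
 *
 *   // Query both size requirements, then allocate one buffer that is
 *   // large enough for either call.
 *   size_t max_bytes = 0, sum_bytes = 0;
 *   cub::DeviceSegmentedReduce::Max(
 *     nullptr, max_bytes, d_in, d_max,
 *     num_segments, d_offsets, d_offsets + 1);
 *   cub::DeviceSegmentedReduce::Sum(
 *     nullptr, sum_bytes, d_in, d_sum,
 *     num_segments, d_offsets, d_offsets + 1);
 *
 *   size_t temp_storage_bytes = std::max(max_bytes, sum_bytes);
 *   void *d_temp_storage = nullptr;
 *   cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 *   cub::DeviceSegmentedReduce::Max(
 *     d_temp_storage, temp_storage_bytes, d_in, d_max,
 *     num_segments, d_offsets, d_offsets + 1);
 *   cub::DeviceSegmentedReduce::Sum(
 *     d_temp_storage, temp_storage_bytes, d_in, d_sum,
 *     num_segments, d_offsets, d_offsets + 1);
 *   // d_max <-- [8, INT_MIN, 9], d_sum <-- [21, 0, 17]
 *
 *   cudaFree(d_temp_storage);
 *   cudaFree(d_in); cudaFree(d_max); cudaFree(d_sum); cudaFree(d_offsets);
 *   return 0;
 * }
 * @endcode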
*/ template CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The input value type using InputT = cub::detail::value_t; return DispatchSegmentedReduce< InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, cub::Max>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::Max(), Traits::Lowest(), // replace with // std::numeric_limits::lowest() // when C++11 support is // more prevalent stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Finds the first device-wide maximum in each segment using the * greater-than ('>') operator, also returning the in-segment index of * that item * * @par * - The output value type of `d_out` is `cub::KeyValuePair` * (assuming the value type of `d_in` is `T`) * - The maximum of the *i*th segment is written to * `d_out[i].value` and its offset in that segment is written to * `d_out[i].key`. * - The `{1, std::numeric_limits::lowest()}` tuple is produced for * zero-length inputs * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). * - Does not support `>` operators that are non-commutative. * - Let `s` be in `[0, num_segments)`. The range * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet * The code snippet below illustrates the argmax-reduction of a device vector * of `int` data elements. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedReduce::ArgMax( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run argmax-reduction * cub::DeviceSegmentedReduce::ArgMax( * d_temp_storage, temp_storage_bytes, d_in, d_out, * num_segments, d_offsets, d_offsets + 1); * * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input items * (of some type `T`) \iterator * * @tparam OutputIteratorT * **[inferred]** Output iterator type for recording the reduced aggregate * (having value type `KeyValuePair`) \iterator * * @tparam BeginOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
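 *
 * @par
 * Device memory may equally be managed with Thrust containers; the sketch
 * below is one such arrangement. The use of `thrust::device_vector` and
 * the host-side names are illustrative assumptions, not a requirement of
 * this interface.
 * @code
 * #include <cub/cub.cuh>
 * #include <thrust/device_vector.h>
 * #include <thrust/host_vector.h>
 * #include <vector>
 *
 * int main()
 * {
 *   const int num_segments = 3;
 *   std::vector<int> h_in{8, 6, 7, 5, 3, 0, 9};
 *   std::vector<int> h_offsets{0, 3, 3, 7};
 *
 *   thrust::device_vector<int> d_in(h_in.begin(), h_in.end());
 *   thrust::device_vector<int> d_offsets(h_offsets.begin(), h_offsets.end());
 *   thrust::device_vector<cub::KeyValuePair<int, int>> d_out(num_segments);
 *
 *   void  *d_temp_storage     = nullptr;
 *   size_t temp_storage_bytes = 0;
 *   cub::DeviceSegmentedReduce::ArgMax(
 *     d_temp_storage, temp_storage_bytes,
 *     thrust::raw_pointer_cast(d_in.data()),
 *     thrust::raw_pointer_cast(d_out.data()),
 *     num_segments,
 *     thrust::raw_pointer_cast(d_offsets.data()),
 *     thrust::raw_pointer_cast(d_offsets.data()) + 1);
 *
 *   thrust::device_vector<unsigned char> d_temp(temp_storage_bytes);
 *   d_temp_storage = thrust::raw_pointer_cast(d_temp.data());
 *
 *   cub::DeviceSegmentedReduce::ArgMax(
 *     d_temp_storage, temp_storage_bytes,
 *     thrust::raw_pointer_cast(d_in.data()),
 *     thrust::raw_pointer_cast(d_out.data()),
 *     num_segments,
 *     thrust::raw_pointer_cast(d_offsets.data()),
 *     thrust::raw_pointer_cast(d_offsets.data()) + 1);
 *
 *   thrust::host_vector<cub::KeyValuePair<int, int>> h_out = d_out;
 *   // h_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
 *   return 0;
 * }
 * @endcode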
*/ template CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { // Signed integer type for global offsets using OffsetT = int; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; // The output value type using OutputValueT = typename OutputTupleT::Value; // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value, replace with std::numeric_limits::lowest() when C++11 // support is more prevalent OutputTupleT initial_value(1, Traits::Lowest()); return DispatchSegmentedReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::ArgMax(), initial_value, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/device_segmented_sort.cuh000066400000000000000000003757761434614775400214050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file * cub::DeviceSegmentedSort provides device-wide, parallel operations for * computing a batched sort across multiple, non-overlapping sequences of * data items residing within device-accessible memory. */ #pragma once #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceSegmentedSort provides device-wide, parallel operations for * computing a batched sort across multiple, non-overlapping sequences of * data items residing within device-accessible memory. * ![](segmented_sorting_logo.png) * @ingroup SegmentedModule * * @par Overview * The algorithm arranges items into ascending (or descending) order. * The underlying sorting algorithm is undefined. Depending on the segment size, * it might be radix sort, merge sort or something else. Therefore, no * assumptions on the underlying implementation should be made. * * @par Differences from DeviceSegmentedRadixSort * DeviceSegmentedRadixSort is optimized for significantly large segments (tens * of thousands of items and more). Nevertheless, some domains produce a wide * range of segment sizes. DeviceSegmentedSort partitions segments into size * groups and specialize sorting algorithms for each group. This approach leads * to better resource utilization in the presence of segment size imbalance or * moderate segment sizes (up to thousands of items). * This algorithm is more complex and consists of multiple kernels. This fact * leads to longer compilation times as well as larger binaries sizes. * * @par Supported Types * The algorithm has to satisfy the underlying algorithms restrictions. Radix * sort usage restricts the list of supported types. Therefore, * DeviceSegmentedSort can sort all of the built-in C++ numeric primitive types * (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half` and * `__nv_bfloat16` 16-bit floating-point types. * * @par Segments are not required to be contiguous. Any element of input(s) or * output(s) outside the specified segments will not be accessed nor modified. * * @par A simple example * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_values_out; // e.g., [-, -, -, -, -, -, -] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] * @endcode */ struct DeviceSegmentedSort { /*************************************************************************//** * @name Keys-only ****************************************************************************/ //@{ /** * @brief Sorts segments of keys into ascending order. 
Approximately * `num_items + 2*num_segments` auxiliary storage required. * * @par * - The contents of the input data are not altered by the sorting operation. * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - SortKeys is not guaranteed to be stable. That is, suppose that @p i and * @p j are equivalent: neither one is less than the other. It is not * guaranteed that the relative order of these two elements will be * preserved by sort. * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not * be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible * // pointers for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When nullptr, the * required allocation size is written to @p temp_storage_bytes and no work * is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. 
* If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of keys into descending order. Approximately * `num_items + 2*num_segments` auxiliary storage required. * * @par * - The contents of the input data are not altered by the sorting operation. * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments + 1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets + 1`). * - SortKeysDescending is not guaranteed to be stable. That is, suppose that * @p i and @p j are equivalent: neither one is less than the other. It is * not guaranteed that the relative order of these two elements will be * preserved by sort. * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not * be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When nullptr, the * required allocation size is written to @p temp_storage_bytes and no * work is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of keys into ascending order. Approximately * `2*num_segments` auxiliary storage required. 
* * @par * - The sorting operation is given a pair of key buffers managed by a * DoubleBuffer structure that indicates which of the two buffers is * "current" (and thus contains the input data to be sorted). * - The contents of both buffers may be altered by the sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within the DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the number * of key bits and the targeted device architecture). * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - SortKeys is not guaranteed to be stable. That is, suppose that * @p i and @p j are equivalent: neither one is less than the other. It is * not guaranteed that the relative order of these two elements will be * preserved by sort. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible * // pointers for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... * * // Create a DoubleBuffer to wrap the pair of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. 
When nullptr, the * required allocation size is written to @p temp_storage_bytes and no * work is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` * and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_values; return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of keys into descending order. Approximately * `2*num_segments` auxiliary storage required. * * @par * - The sorting operation is given a pair of key buffers managed by a * DoubleBuffer structure that indicates which of the two buffers is * "current" (and thus contains the input data to be sorted). * - The contents of both buffers may be altered by the sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within the DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the number * of key bits and the targeted device architecture). * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments + 1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets + 1`). * - SortKeysDescending is not guaranteed to be stable. That is, suppose that * @p i and @p j are equivalent: neither one is less than the other. It is * not guaranteed that the relative order of these two elements will be * preserved by sort. 
* - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... * * // Create a DoubleBuffer to wrap the pair of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::SortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to @p temp_storage_bytes and no work * is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1<= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
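 *
 * @par
 * The aliasing note above assumes a contiguous segmentation in which
 * `segment_offsets[i]` is the start of segment `i` and
 * `segment_offsets[num_segments]` equals @p num_items. A minimal host-side
 * sketch of building such a sequence follows; the names `h_segment_sizes`
 * and `h_offsets` are illustrative only (not part of the CUB API), and
 * `<vector>` is assumed to be included. `d_offsets` can then be passed as
 * @p d_begin_offsets and `d_offsets + 1` as @p d_end_offsets, exactly as in
 * the snippet above.
 *
 * @code
 * // Per-segment sizes (one segment empty), chosen for illustration
 * std::vector<int> h_segment_sizes{3, 0, 4};
 * int num_segments = static_cast<int>(h_segment_sizes.size());
 *
 * // Exclusive prefix sum over the sizes yields num_segments + 1 offsets
 * std::vector<int> h_offsets(num_segments + 1, 0);
 * for (int i = 0; i < num_segments; ++i)
 * {
 *   h_offsets[i + 1] = h_offsets[i] + h_segment_sizes[i];
 * }
 * int num_items = h_offsets[num_segments]; // total item count, e.g., 7
 *
 * // Copy the offsets to the device; d_offsets and d_offsets + 1 then serve
 * // as the begin/end offset iterators expected by DeviceSegmentedSort
 * int *d_offsets = nullptr;
 * cudaMalloc(&d_offsets, sizeof(int) * (num_segments + 1));
 * cudaMemcpy(d_offsets, h_offsets.data(),
 *            sizeof(int) * (num_segments + 1), cudaMemcpyHostToDevice);
 * @endcode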
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_values; return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of keys into ascending order. Approximately * `num_items + 2*num_segments` auxiliary storage required. * * @par * - The contents of the input data are not altered by the sorting operation. * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - StableSortKeys is stable: it preserves the relative ordering of * equivalent elements. That is, if @p x and @p y are elements such that * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not * be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When nullptr, the * required allocation size is written to @p temp_storage_bytes and no work * is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { return SortKeys( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of keys into descending order. Approximately * `num_items + 2*num_segments` auxiliary storage required. * * @par * - The contents of the input data are not altered by the sorting operation. 
* - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - StableSortKeysDescending is stable: it preserves the relative ordering of * equivalent elements. That is, if @p x and @p y are elements such that * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not * be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::StableSortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::StableSortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When nullptr, the * required allocation size is written to @p temp_storage_bytes and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. 
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { return SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of keys into ascending order. Approximately * `2*num_segments` auxiliary storage required. * * @par * - The sorting operation is given a pair of key buffers managed by a * DoubleBuffer structure that indicates which of the two buffers is * "current" (and thus contains the input data to be sorted). * - The contents of both buffers may be altered by the sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within the DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the number * of key bits and the targeted device architecture). * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - StableSortKeys is stable: it preserves the relative ordering of * equivalent elements. That is, if @p x and @p y are elements such that * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Create a DoubleBuffer to wrap the pair of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When nullptr, the * required allocation size is written to @p temp_storage_bytes and no work * is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of keys into descending order. Approximately * `2*num_segments` auxiliary storage required. 
* * @par * - The sorting operation is given a pair of key buffers managed by a * DoubleBuffer structure that indicates which of the two buffers is * "current" (and thus contains the input data to be sorted). * - The contents of both buffers may be altered by the sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within the DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the number * of key bits and the targeted device architecture). * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - StableSortKeysDescending is stable: it preserves the relative ordering of * equivalent elements. That is, if @p x and @p y are elements such that * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... * * // Create a DoubleBuffer to wrap the pair of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::StableSortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::StableSortKeysDescending( * d_temp_storage, temp_storage_bytes, d_keys, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When nullptr, the * required allocation size is written to @p temp_storage_bytes and no work * is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that d_end_offsets[i]-1 is the last * element of the ith data segment in `d_keys_*` and * `d_values_*`. If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the * i-th segment is considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { return SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //@} end member group /*************************************************************************//** * @name Key-value pairs ****************************************************************************/ //@{ /** * @brief Sorts segments of key-value pairs into ascending order. * Approximately `2*num_items + 2*num_segments` auxiliary storage * required. * * @par * - The contents of the input data are not altered by the sorting operation. * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - SortPairs is not guaranteed to be stable. That is, suppose that @p i and * @p j are equivalent: neither one is less than the other. It is not * guaranteed that the relative order of these two elements will be * preserved by sort. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. 
* * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys with associated vector of * @p int values. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_values_out; // e.g., [-, -, -, -, -, -, -] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam ValueT * [inferred] Value type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to @p temp_storage_bytes and no work * is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] d_values_in * Device-accessible pointer to the corresponding input sequence of * associated value items * * @param[out] d_values_out * Device-accessible pointer to the correspondingly-reordered output * sequence of associated value items * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
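 *
 * @par
 * Because of the overlap requirements above, this overload cannot write its
 * results back into `d_keys_in` / `d_values_in`; distinct output allocations
 * are required. When reusing the input storage matters, one alternative is
 * the DoubleBuffer-based SortPairs overload documented later in this group.
 * A minimal sketch (the `*_buf` pointer names are illustrative; each
 * alternate buffer is scratch of the same size as its primary buffer):
 *
 * @code
 * // Wrap the input buffers and same-sized scratch buffers
 * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
 * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
 *
 * cub::DeviceSegmentedSort::SortPairs(
 *   d_temp_storage, temp_storage_bytes, d_keys, d_values,
 *   num_items, num_segments, d_offsets, d_offsets + 1);
 *
 * // Read the sorted sequences through d_keys.Current() and
 * // d_values.Current(); either buffer of each pair may be selected
 * @endcode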
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of key-value pairs into descending order. Approximately * `2*num_items + 2*num_segments` auxiliary storage required. * * @par * - The contents of the input data are not altered by the sorting operation. * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - SortPairs is not guaranteed to be stable. That is, suppose that @p i and * @p j are equivalent: neither one is less than the other. It is not * guaranteed that the relative order of these two elements will be * preserved by sort. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys with associated vector of * @p int values. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_values_out; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam ValueT * [inferred] Value type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When nullptr, the * required allocation size is written to @p temp_storage_bytes and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] d_values_in * Device-accessible pointer to the corresponding input sequence of * associated value items * * @param[out] d_values_out * Device-accessible pointer to the correspondingly-reordered output * sequence of associated value items * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
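 *
 * @par
 * Both the size query and the sorting call return a `cudaError_t`, as does
 * `cudaMalloc`. A minimal error-checking sketch, reusing the variables from
 * the snippet above (with `d_temp_storage` still NULL for the size query);
 * the reporting style via `printf` is illustrative only:
 *
 * @code
 * cudaError_t status = cub::DeviceSegmentedSort::SortPairsDescending(
 *   d_temp_storage, temp_storage_bytes,
 *   d_keys_in, d_keys_out, d_values_in, d_values_out,
 *   num_items, num_segments, d_offsets, d_offsets + 1);
 * if (status != cudaSuccess)
 * {
 *   printf("Size query failed: %s\n", cudaGetErrorString(status));
 * }
 *
 * status = cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * if (status != cudaSuccess)
 * {
 *   printf("Temporary storage allocation failed: %s\n",
 *          cudaGetErrorString(status));
 * }
 *
 * status = cub::DeviceSegmentedSort::SortPairsDescending(
 *   d_temp_storage, temp_storage_bytes,
 *   d_keys_in, d_keys_out, d_values_in, d_values_out,
 *   num_items, num_segments, d_offsets, d_offsets + 1);
 * if (status != cudaSuccess)
 * {
 *   // Note: the sort runs asynchronously, so device-side failures may only
 *   // surface at the next synchronization point
 *   printf("Sort launch failed: %s\n", cudaGetErrorString(status));
 * }
 * @endcode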
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of key-value pairs into ascending order. * Approximately `2*num_segments` auxiliary storage required. * * @par * - The sorting operation is given a pair of key buffers and a corresponding * pair of associated value buffers. Each pair is managed by a DoubleBuffer * structure that indicates which of the two buffers is "current" (and thus * contains the input data to be sorted). * - The contents of both buffers within each pair may be altered by the sorting * operation. * - Upon completion, the sorting operation will update the "current" indicator * within each DoubleBuffer wrapper to reference which of the two buffers * now contains the sorted output sequence (a function of the number of key bits * specified and the targeted device architecture). * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - SortPairs is not guaranteed to be stable. That is, suppose that @p i and * @p j are equivalent: neither one is less than the other. It is not * guaranteed that the relative order of these two elements will be * preserved by sort. * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, * `d_values.Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys with associated vector of * @p int values. 
* * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... * * // Create a set of DoubleBuffers to wrap pairs of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::SortPairs( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::SortPairs( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] * * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam ValueT * [inferred] Value type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to @p temp_storage_bytes and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in,out] d_values * Double-buffer of values whose "current" device-accessible buffer contains * the unsorted input values and, upon return, is updated to point to the * sorted output values * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
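 *
 * @par
 * After the call, the "current" buffer of each DoubleBuffer may be either of
 * the two wrapped allocations, so results should be read through `Current()`
 * rather than through the original pointers. A minimal sketch of copying the
 * sorted keys and values back to the host (the `h_sorted_*` names are
 * illustrative and `<vector>` is assumed to be included):
 *
 * @code
 * std::vector<int> h_sorted_keys(num_items);
 * std::vector<int> h_sorted_values(num_items);
 *
 * // d_keys.Current() / d_values.Current() reference the sorted sequences;
 * // d_key_buf itself may now hold either the sorted data or scratch
 * cudaMemcpy(h_sorted_keys.data(), d_keys.Current(),
 *            sizeof(int) * num_items, cudaMemcpyDeviceToHost);
 * cudaMemcpy(h_sorted_values.data(), d_values.Current(),
 *            sizeof(int) * num_items, cudaMemcpyDeviceToHost);
 * @endcode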
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; using DispatchT = DispatchSegmentedSort; return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of key-value pairs into descending order. * Approximately `2*num_segments` auxiliary storage required. * * @par * - The sorting operation is given a pair of key buffers and a corresponding * pair of associated value buffers. Each pair is managed by a DoubleBuffer * structure that indicates which of the two buffers is "current" (and thus * contains the input data to be sorted). * - The contents of both buffers within each pair may be altered by the * sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within each DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the number * of key bits specified and the targeted device architecture). * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length num_segments+1) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as segment_offsets+1). * - SortPairsDescending is not guaranteed to be stable. That is, suppose that * @p i and @p j are equivalent: neither one is less than the other. It is * not guaranteed that the relative order of these two elements will be * preserved by sort. * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, * `d_values.Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys with associated vector of * @p int values. 
* * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... * * // Create a set of DoubleBuffers to wrap pairs of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::SortPairsDescending( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] * * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam ValueT * [inferred] Value type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When nullptr, the * required allocation size is written to @p temp_storage_bytes and no work * is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in,out] d_values * Double-buffer of values whose "current" device-accessible buffer contains * the unsorted input values and, upon return, is updated to point to the * sorted output values * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
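 *
 * @par
 * The optional @p stream argument places both the size query and the sort on
 * a user-created stream. A minimal variant of the snippet above (the rest of
 * the setup is unchanged; stream creation and cleanup use the plain CUDA
 * runtime API):
 *
 * @code
 * cudaStream_t stream;
 * cudaStreamCreate(&stream);
 *
 * cub::DeviceSegmentedSort::SortPairsDescending(
 *   d_temp_storage, temp_storage_bytes, d_keys, d_values,
 *   num_items, num_segments, d_offsets, d_offsets + 1, stream);
 *
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 * cub::DeviceSegmentedSort::SortPairsDescending(
 *   d_temp_storage, temp_storage_bytes, d_keys, d_values,
 *   num_items, num_segments, d_offsets, d_offsets + 1, stream);
 *
 * // The sort is asynchronous with respect to the host
 * cudaStreamSynchronize(stream);
 * cudaStreamDestroy(stream);
 * @endcode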
*/ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; using DispatchT = DispatchSegmentedSort; return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of key-value pairs into ascending order. Approximately * `2*num_items + 2*num_segments` auxiliary storage required. * * @par * - The contents of the input data are not altered by the sorting operation. * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - StableSortPairs is stable: it preserves the relative ordering of * equivalent elements. That is, if @p x and @p y are elements such that * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys with associated vector of * @p int values. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_values_out; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam ValueT * [inferred] Value type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When nullptr, the * required allocation size is written to @p temp_storage_bytes and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] d_values_in * Device-accessible pointer to the corresponding input sequence of * associated value items * * @param[out] d_values_out * Device-accessible pointer to the correspondingly-reordered output * sequence of associated value items * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
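 *
 * @par
 * The stability guarantee is observable whenever keys repeat: values attached
 * to equal keys come out in their original relative order. A minimal sketch
 * with duplicated keys (data chosen for illustration; `d_temp_storage` is
 * assumed to have been sized and allocated as in the snippet above, and
 * `d_offsets` holds [0, 4]):
 *
 * @code
 * // d_keys_in    <-- [2, 1, 2, 1]
 * // d_values_in  <-- [0, 1, 2, 3]
 *
 * cub::DeviceSegmentedSort::StableSortPairs(
 *   d_temp_storage, temp_storage_bytes,
 *   d_keys_in, d_keys_out, d_values_in, d_values_out,
 *   4, 1, d_offsets, d_offsets + 1);
 *
 * // d_keys_out   <-- [1, 1, 2, 2]
 * // d_values_out <-- [1, 3, 0, 2]   (ties keep their original order)
 * @endcode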
*/ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { return SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of key-value pairs into descending order. * Approximately `2*num_items + 2*num_segments` auxiliary * storage required. * * @par * - The contents of the input data are not altered by the sorting operation. * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - StableSortPairsDescending is stable: it preserves the relative ordering * of equivalent elements. That is, if @p x and @p y are elements such that * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys with associated vector of * @p int values. * * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_values_out; // e.g., [-, -, -, -, -, -, -] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::StableSortPairsDescending( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::StableSortPairsDescending( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_keys_out, d_values_in, d_values_out, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam ValueT * [inferred] Value type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to @p temp_storage_bytes and no work * is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * * @param[in] d_values_in * Device-accessible pointer to the corresponding input sequence of * associated value items * * @param[out] d_values_out * Device-accessible pointer to the correspondingly-reordered output * sequence of associated value items * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
*/ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { return SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of key-value pairs into ascending order. * Approximately `2*num_segments` auxiliary storage required. * * @par * - The sorting operation is given a pair of key buffers and a corresponding * pair of associated value buffers. Each pair is managed by a DoubleBuffer * structure that indicates which of the two buffers is "current" (and thus * contains the input data to be sorted). * - The contents of both buffers within each pair may be altered by the * sorting operation. * - Upon completion, the sorting operation will update the "current" * indicator within each DoubleBuffer wrapper to reference which of the two * buffers now contains the sorted output sequence (a function of the number * of key bits specified and the targeted device architecture). * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - StableSortPairs is stable: it preserves the relative ordering * of equivalent elements. That is, if @p x and @p y are elements such that * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, * `d_values.Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys with associated vector of * @p int values. 
* * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... * * // Create a set of DoubleBuffers to wrap pairs of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] * * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam ValueT * [inferred] Value type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to @p temp_storage_bytes and no work * is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in,out] d_values * Double-buffer of values whose "current" device-accessible buffer contains * the unsorted input values and, upon return, is updated to point to the * sorted output values * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
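 *
 * @par
 * Because the sort may leave its results in either buffer of each pair, the
 * output should be consumed through the updated "current" selectors rather
 * than through the original raw pointers. A minimal sketch follows (reusing
 * the names from the snippet above; `h_keys` and `h_values` are hypothetical
 * host arrays of `num_items` ints, not part of the snippet):
 * @code
 * int *d_sorted_keys   = d_keys.Current();
 * int *d_sorted_values = d_values.Current();
 * cudaMemcpy(h_keys, d_sorted_keys, num_items * sizeof(int),
 *            cudaMemcpyDeviceToHost);
 * cudaMemcpy(h_values, d_sorted_values, num_items * sizeof(int),
 *            cudaMemcpyDeviceToHost);
 * @endcode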
*/ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { return SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } /** * @brief Sorts segments of key-value pairs into descending order. * Approximately `2*num_segments` auxiliary storage required. * * @par * - The sorting operation is given a pair of key buffers and a corresponding * pair of associated value buffers. Each pair is managed by a DoubleBuffer * structure that indicates which of the two buffers is "current" (and thus * contains the input data to be sorted). * - The contents of both buffers within each pair may be altered by the sorting * operation. * - Upon completion, the sorting operation will update the "current" indicator * within each DoubleBuffer wrapper to reference which of the two buffers * now contains the sorted output sequence (a function of the number of key bits * specified and the targeted device architecture). * - When the input is a contiguous sequence of segments, a single sequence * @p segment_offsets (of length `num_segments+1`) can be aliased * for both the @p d_begin_offsets and @p d_end_offsets parameters (where * the latter is specified as `segment_offsets+1`). * - StableSortPairsDescending is stable: it preserves the relative ordering * of equivalent elements. That is, if @p x and @p y are elements such that * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. * - Segments are not required to be contiguous. For all index values `i` * outside the specified segments `d_keys.Current()[i]`, * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, * `d_values.Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of @p int keys with associated vector of * @p int values. 
* * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] * ... * * // Create a set of DoubleBuffers to wrap pairs of device pointers * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSegmentedSort::StableSortPairsDescending( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceSegmentedSort::StableSortPairsDescending( * d_temp_storage, temp_storage_bytes, d_keys, d_values, * num_items, num_segments, d_offsets, d_offsets + 1); * * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] * @endcode * * @tparam KeyT * [inferred] Key type * * @tparam ValueT * [inferred] Value type * * @tparam BeginOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * beginning offsets \iterator * * @tparam EndOffsetIteratorT * [inferred] Random-access input iterator type for reading segment * ending offsets \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to @p temp_storage_bytes and no work * is done * * @param[in,out] temp_storage_bytes * Reference to size in bytes of @p d_temp_storage allocation * * @param[in,out] d_keys * Reference to the double-buffer of keys whose "current" device-accessible * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * * @param[in,out] d_values * Double-buffer of values whose "current" device-accessible buffer contains * the unsorted input values and, upon return, is updated to point to the * sorted output values * * @param[in] num_items * The total number of items to sort (across all segments) * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length @p num_segments, such that `d_begin_offsets[i]` is the first * element of the ith data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of * the ith data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. * * @param[in] stream * [optional] CUDA stream to launch kernels within. Default is * stream0. 
*/ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { return SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //@} end member group }; CUB_NAMESPACE_END cub-2.0.1/cub/device/device_select.cuh000066400000000000000000001177201434614775400176210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceSelect provides device-wide, parallel operations for * compacting selected items from sequences of data items residing within * device-accessible memory. */ #pragma once #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceSelect provides device-wide, parallel operations for compacting * selected items from sequences of data items residing within * device-accessible memory. 
![](select_logo.png) * @ingroup SingleModule * * @par Overview * These operations apply a selection criterion to selectively copy * items from a specified input sequence to a compact output sequence. * * @par Usage Considerations * @cdp_class{DeviceSelect} * * @par Performance * @linear_performance{select-flagged, select-if, and select-unique} * * @par * The following chart illustrates DeviceSelect::If performance across * different CUDA architectures for `int32` items, where 50% of the items are * randomly selected. * * @image html select_if_int32_50_percent.png * * @par * The following chart illustrates DeviceSelect::Unique performance across * different CUDA architectures for `int32` items where segments have lengths * uniformly sampled from `[1, 1000]`. * * @image html select_unique_int32_len_500.png * * @par * @plots_below * */ struct DeviceSelect { /** * @brief Uses the `d_flags` sequence to selectively copy the corresponding * items from `d_in` into `d_out`. The total number of items selected * is written to `d_num_selected_out`. ![](select_flags_logo.png) * * @par * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`, * `char`, `int`, etc.). * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap * `[d_in, d_in + num_items)`, `[d_flags, d_flags + num_items)` nor * `d_num_selected_out` in any way. * - @devicestorage * * @par Snippet * The code snippet below illustrates the compaction of items selected from * an `int` device vector. * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input, * // flags, and output * int num_items; // e.g., 8 * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] * int *d_out; // e.g., [ , , , , , , , ] * int *d_num_selected_out; // e.g., [ ] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::Flagged( * d_temp_storage, temp_storage_bytes, * d_in, d_flags, d_out, d_num_selected_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run selection * cub::DeviceSelect::Flagged( * d_temp_storage, temp_storage_bytes, * d_in, d_flags, d_out, d_num_selected_out, num_items); * * // d_out <-- [1, 4, 6, 7] * // d_num_selected_out <-- [4] * * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam FlagIterator * **[inferred]** Random-access input iterator type for reading selection * flags \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing selected * items \iterator * * @tparam NumSelectedIteratorT * **[inferred]** Output iterator type for recording the number of items * selected \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[in] d_flags * Pointer to the input sequence of selection flags * * @param[out] d_out * Pointer to the output sequence of selected data items * * @param[out] d_num_selected_out * Pointer to the output total number of items selected * (i.e., length of `d_out`) * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Flagged(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream = 0) { using OffsetT = int; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) return DispatchSelectIf::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, SelectOp(), EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Flagged(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } /** * @brief Uses the `d_flags` sequence to selectively compact the items in * `d_data`. The total number of items selected is written to * `d_num_selected_out`. ![](select_flags_logo.png) * * @par * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`, * `char`, `int`, etc.). * - Copies of the selected items are compacted in-place and maintain * their original relative ordering. * - The `d_data` may equal `d_flags`. The range * `[d_data, d_data + num_items)` shall not overlap * `[d_flags, d_flags + num_items)` in any other way. * - @devicestorage * * @par Snippet * The code snippet below illustrates the compaction of items selected from * an `int` device vector. * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input, * // flags, and output * int num_items; // e.g., 8 * int *d_data; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] * int *d_num_selected_out; // e.g., [ ] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::Flagged( * d_temp_storage, temp_storage_bytes, * d_data, d_flags, d_num_selected_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run selection * cub::DeviceSelect::Flagged( * d_temp_storage, temp_storage_bytes, * d_data, d_flags, d_num_selected_out, num_items); * * // d_data <-- [1, 4, 6, 7] * // d_num_selected_out <-- [4] * * @endcode * * @tparam IteratorT * **[inferred]** Random-access iterator type for reading and writing * selected items \iterator * * @tparam FlagIterator * **[inferred]** Random-access input iterator type for reading selection * flags \iterator * * @tparam NumSelectedIteratorT * **[inferred]** Output iterator type for recording the number of items * selected \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_data * Pointer to the sequence of data items * * @param[in] d_flags * Pointer to the input sequence of selection flags * * @param[out] d_num_selected_out * Pointer to the output total number of items selected * * @param[in] num_items * Total number of input items (i.e., length of `d_data`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Flagged(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream = 0) { using OffsetT = int; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) constexpr bool may_alias = true; return DispatchSelectIf::Dispatch(d_temp_storage, temp_storage_bytes, d_data, // in d_flags, d_data, // out d_num_selected_out, SelectOp(), EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Flagged(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Flagged( d_temp_storage, temp_storage_bytes, d_data, d_flags, d_num_selected_out, num_items, stream); } /** * @brief Uses the `select_op` functor to selectively copy items from `d_in` * into `d_out`. The total number of items selected is written to * `d_num_selected_out`. ![](select_logo.png) * * @par * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap * `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way. * - @devicestorage * * @par Performance * The following charts illustrate saturated select-if performance across * different CUDA architectures for `int32` and `int64` items, respectively. * Items are selected with 50% probability. 
* * @image html select_if_int32_50_percent.png * @image html select_if_int64_50_percent.png * * @par * The following charts are similar, but 5% selection probability: * * @image html select_if_int32_5_percent.png * @image html select_if_int64_5_percent.png * * @par Snippet * The code snippet below illustrates the compaction of items selected from * an `int` device vector. * @par * @code * #include // or equivalently * * // Functor type for selecting values less than some criteria * struct LessThan * { * int compare; * * CUB_RUNTIME_FUNCTION __forceinline__ * LessThan(int compare) : compare(compare) {} * * CUB_RUNTIME_FUNCTION __forceinline__ * bool operator()(const int &a) const { * return (a < compare); * } * }; * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] * int *d_out; // e.g., [ , , , , , , , ] * int *d_num_selected_out; // e.g., [ ] * LessThan select_op(7); * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::If( * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items, select_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run selection * cub::DeviceSelect::If( * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items, select_op); * * // d_out <-- [0, 2, 3, 5, 2] * // d_num_selected_out <-- [5] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing selected * items \iterator * * @tparam NumSelectedIteratorT * **[inferred]** Output iterator type for recording the number of items * selected \iterator * * @tparam SelectOp * **[inferred]** Selection operator type having member * `bool operator()(const T &a)` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output sequence of selected data items * * @param[out] d_num_selected_out * Pointer to the output total number of items selected * (i.e., length of `d_out`) * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] select_op * Unary selection operator * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
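 *
 * @par
 * The selection count is written to device memory; a minimal sketch (not
 * part of the snippet above) of copying it back into a hypothetical host
 * variable `h_num_selected`:
 * @code
 * int h_num_selected = 0;
 * cudaMemcpy(&h_num_selected, d_num_selected_out, sizeof(int),
 *            cudaMemcpyDeviceToHost);
 * // h_num_selected <-- 5; only the first h_num_selected entries of d_out
 * // hold selected items
 * @endcode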
*/ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t If(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream = 0) { using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType *; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) return DispatchSelectIf::Dispatch(d_temp_storage, temp_storage_bytes, d_in, NULL, d_out, d_num_selected_out, select_op, EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t If(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } /** * @brief Uses the `select_op` functor to selectively compact items in * `d_data`. The total number of items selected is written to * `d_num_selected_out`. ![](select_logo.png) * * @par * - Copies of the selected items are compacted in `d_data` and maintain * their original relative ordering. * - @devicestorage * * @par Snippet * The code snippet below illustrates the compaction of items selected from * an `int` device vector. * @par * @code * #include // or equivalently * * // Functor type for selecting values less than some criteria * struct LessThan * { * int compare; * * CUB_RUNTIME_FUNCTION __forceinline__ * LessThan(int compare) : compare(compare) {} * * CUB_RUNTIME_FUNCTION __forceinline__ * bool operator()(const int &a) const { * return (a < compare); * } * }; * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_data; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] * int *d_num_selected_out; // e.g., [ ] * LessThan select_op(7); * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::If( * d_temp_storage, temp_storage_bytes, * d_data, d_num_selected_out, num_items, select_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run selection * cub::DeviceSelect::If( * d_temp_storage, temp_storage_bytes, * d_data, d_num_selected_out, num_items, select_op); * * // d_data <-- [0, 2, 3, 5, 2] * // d_num_selected_out <-- [5] * @endcode * * @tparam IteratorT * **[inferred]** Random-access input iterator type for reading and * writing items \iterator * * @tparam NumSelectedIteratorT * **[inferred]** Output iterator type for recording the number of items * selected \iterator * * @tparam SelectOp * **[inferred]** Selection operator type having member * `bool operator()(const T &a)` * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_data * Pointer to the sequence of data items * * @param[out] d_num_selected_out * Pointer to the output total number of items selected * * @param[in] num_items * Total number of input items (i.e., length of `d_data`) * * @param[in] select_op * Unary selection operator * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t If(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream = 0) { using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType *; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) constexpr bool may_alias = true; return DispatchSelectIf::Dispatch(d_temp_storage, temp_storage_bytes, d_data, // in NULL, d_data, // out d_num_selected_out, select_op, EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t If(void *d_temp_storage, size_t &temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return If(d_temp_storage, temp_storage_bytes, d_data, d_num_selected_out, num_items, select_op, stream); } /** * @brief Given an input sequence `d_in` having runs of consecutive * equal-valued keys, only the first key from each run is selectively * copied to `d_out`. The total number of items selected is written to * `d_num_selected_out`. ![](unique_logo.png) * * @par * - The `==` equality operator is used to determine whether keys are * equivalent * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap * `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way. * - @devicestorage * * @par Performance * The following charts illustrate saturated select-unique performance across different * CUDA architectures for `int32` and `int64` items, respectively. Segments * have lengths uniformly sampled from `[1, 1000]`. * * @image html select_unique_int32_len_500.png * @image html select_unique_int64_len_500.png * * @par * The following charts are similar, but with segment lengths uniformly * sampled from `[1, 10]`: * * @image html select_unique_int32_len_5.png * @image html select_unique_int64_len_5.png * * @par Snippet * The code snippet below illustrates the compaction of items selected from * an `int` device vector. * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] * int *d_out; // e.g., [ , , , , , , , ] * int *d_num_selected_out; // e.g., [ ] * ... 
* * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::Unique( * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run selection * cub::DeviceSelect::Unique( * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items); * * // d_out <-- [0, 2, 9, 5, 8] * // d_num_selected_out <-- [5] * @endcode * * @tparam InputIteratorT * **[inferred]** Random-access input iterator type for reading input * items \iterator * * @tparam OutputIteratorT * **[inferred]** Random-access output iterator type for writing selected * items \iterator * * @tparam NumSelectedIteratorT * **[inferred]** Output iterator type for recording the number of items * selected \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output sequence of selected data items * * @param[out] d_num_selected_out * Pointer to the output total number of items selected * (i.e., length of `d_out`) * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Unique(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream = 0) { using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType *; // FlagT iterator type (not used) using SelectOp = NullType; // Selection op (not used) using EqualityOp = Equality; // Default == operator return DispatchSelectIf::Dispatch(d_temp_storage, temp_storage_bytes, d_in, NULL, d_out, d_num_selected_out, SelectOp(), EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Unique(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Unique( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream); } /** * @brief Given an input sequence `d_keys_in` and `d_values_in` with runs of * key-value pairs with consecutive equal-valued keys, only the first * key and its value from each run is selectively copied to * `d_keys_out` and `d_values_out`. The total number of items selected * is written to `d_num_selected_out`. ![](unique_logo.png) * * @par * - The `==` equality operator is used to determine whether keys are * equivalent * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. * - In-place operations are not supported. 
There must be no overlap between * any of the provided ranges: * - `[d_keys_in, d_keys_in + num_items)` * - `[d_keys_out, d_keys_out + *d_num_selected_out)` * - `[d_values_in, d_values_in + num_items)` * - `[d_values_out, d_values_out + *d_num_selected_out)` * - `[d_num_selected_out, d_num_selected_out + 1)` * - @devicestorage * * @par Snippet * The code snippet below illustrates the compaction of items selected from * an `int` device vector. * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] * int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] * int *d_keys_out; // e.g., [ , , , , , , , ] * int *d_values_out; // e.g., [ , , , , , , , ] * int *d_num_selected_out; // e.g., [ ] * ... * * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::UniqueByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, * d_keys_out, d_values_out, d_num_selected_out, num_items); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run selection * cub::DeviceSelect::UniqueByKey( * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, * d_keys_out, d_values_out, d_num_selected_out, num_items); * * // d_keys_out <-- [0, 2, 9, 5, 8] * // d_values_out <-- [1, 2, 4, 5, 8] * // d_num_selected_out <-- [5] * @endcode * * @tparam KeyInputIteratorT * **[inferred]** Random-access input iterator type for reading input * keys \iterator * * @tparam ValueInputIteratorT * **[inferred]** Random-access input iterator type for reading input * values \iterator * * @tparam KeyOutputIteratorT * **[inferred]** Random-access output iterator type for writing selected * keys \iterator * * @tparam ValueOutputIteratorT * **[inferred]** Random-access output iterator type for writing selected * values \iterator * * @tparam NumSelectedIteratorT * **[inferred]** Output iterator type for recording the number of items * selected \iterator * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys * * @param[in] d_values_in * Pointer to the input sequence of values * * @param[out] d_keys_out * Pointer to the output sequence of selected keys * * @param[out] d_values_out * Pointer to the output sequence of selected values * * @param[out] d_num_selected_out * Pointer to the total number of items selected (i.e., length of * `d_keys_out` or `d_values_out`) * * @param[in] num_items * Total number of input items (i.e., length of `d_keys_in` or * `d_values_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
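 *
 * @par
 * Because the values are only read through `ValueInputIteratorT`, a fancy
 * iterator can stand in for materialized values. The following sketch is an
 * illustrative variation of the snippet above (not part of it) that records
 * the index of the first item of each run by pairing the keys with a
 * `cub::CountingInputIterator`; temporary storage is queried and allocated
 * in the same two-phase manner as above:
 * @code
 * cub::CountingInputIterator<int> d_indices_in(0);
 * cub::DeviceSelect::UniqueByKey(
 *   d_temp_storage, temp_storage_bytes,
 *   d_keys_in, d_indices_in,
 *   d_keys_out, d_values_out, d_num_selected_out, num_items);
 * // d_keys_out <-- [0, 2, 9, 5, 8]
 * // d_values_out <-- [0, 1, 3, 4, 7]
 * @endcode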
*/ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream = 0) { using OffsetT = int; using EqualityOp = Equality; return DispatchUniqueByKey::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, stream); } }; /** * @example example_device_select_flagged.cu * @example example_device_select_if.cu * @example example_device_select_unique.cu */ CUB_NAMESPACE_END cub-2.0.1/cub/device/device_spmv.cuh000066400000000000000000000224541434614775400173260ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). */ #pragma once #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). 
* \ingroup SingleModule * * \par Overview * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) * performs the matrix-vector operation * y = A*x + y, * where: * - A is an mxn sparse matrix whose non-zero structure is specified in * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) * (i.e., three arrays: values, row_offsets, and column_indices) * - x and y are dense vectors * * \par Usage Considerations * \cdp_class{DeviceSpmv} * */ struct DeviceSpmv { /******************************************************************//** * \name CSR matrix operations *********************************************************************/ //@{ /** * \brief This function performs the matrix-vector operation y = A*x. * * \par Snippet * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A * representing a 3x3 lattice (24 non-zeros). * * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, * // and output vector y * int num_rows = 9; * int num_cols = 9; * int num_nonzeros = 24; * * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, * // 1, 1, 1, 1, 1, 1, 1, 1, * // 1, 1, 1, 1, 1, 1, 1, 1] * * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, * // 4, 6, 1, 3, 5, 7, 2, 4, * // 8, 3, 7, 4, 6, 8, 5, 7] * * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] * * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] * float* d_vector_y; // e.g., [ , , , , , , , , ] * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, * num_rows, num_cols, num_nonzeros); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run SpMV * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, * num_rows, num_cols, num_nonzeros); * * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] * * \endcode * * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) */ template < typename ValueT> CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation const ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. const int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) const int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) const ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y int num_rows, ///< [in] number of rows of matrix A. int num_cols, ///< [in] number of columns of matrix A. 
int num_nonzeros, ///< [in] number of nonzero elements of matrix A. cudaStream_t stream = 0) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. { SpmvParams spmv_params; spmv_params.d_values = d_values; spmv_params.d_row_end_offsets = d_row_offsets + 1; spmv_params.d_column_indices = d_column_indices; spmv_params.d_vector_x = d_vector_x; spmv_params.d_vector_y = d_vector_y; spmv_params.num_rows = num_rows; spmv_params.num_cols = num_cols; spmv_params.num_nonzeros = num_nonzeros; spmv_params.alpha = ValueT{1}; spmv_params.beta = ValueT{0}; return DispatchSpmv::Dispatch( d_temp_storage, temp_storage_bytes, spmv_params, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t CsrMV(void *d_temp_storage, size_t &temp_storage_bytes, const ValueT *d_values, const int *d_row_offsets, const int *d_column_indices, const ValueT *d_vector_x, ValueT *d_vector_y, int num_rows, int num_cols, int num_nonzeros, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return CsrMV(d_temp_storage, temp_storage_bytes, d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y, num_rows, num_cols, num_nonzeros, stream); } //@} end member group }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/000077500000000000000000000000001434614775400161115ustar00rootroot00000000000000cub-2.0.1/cub/device/dispatch/dispatch_adjacent_difference.cuh000066400000000000000000000337621434614775400244270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN template void __global__ DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT *result, OffsetT num_tiles, int items_per_tile) { const int tile_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile); } template void __global__ DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input, InputT *first_tile_previous, OutputIteratorT result, DifferenceOpT difference_op, OffsetT num_items) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy; // It is OK to introspect the return type or parameter types of the // `operator()` function of `__device__` extended lambda within device code. using OutputT = detail::invoke_result_t; using Agent = AgentDifference; __shared__ typename Agent::TempStorage storage; Agent agent(storage, input, first_tile_previous, result, difference_op, num_items); int tile_idx = static_cast(blockIdx.x); OffsetT tile_base = static_cast(tile_idx) * ActivePolicyT::ITEMS_PER_TILE; agent.Process(tile_idx, tile_base); } template struct DeviceAdjacentDifferencePolicy { using ValueT = typename std::iterator_traits::value_type; //------------------------------------------------------------------------------ // Architecture-specific tuning policies //------------------------------------------------------------------------------ struct Policy300 : ChainedPolicy<300, Policy300, Policy300> { using AdjacentDifferencePolicy = AgentAdjacentDifferencePolicy<128, Nominal8BItemsToItems(7), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE>; }; struct Policy350 : ChainedPolicy<350, Policy350, Policy300> { using AdjacentDifferencePolicy = AgentAdjacentDifferencePolicy<128, Nominal8BItemsToItems(7), BLOCK_LOAD_WARP_TRANSPOSE, MayAlias ? 
LOAD_CA : LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE>; }; using MaxPolicy = Policy350; }; template > struct DispatchAdjacentDifference : public SelectedPolicy { using InputT = typename std::iterator_traits::value_type; void *d_temp_storage; std::size_t &temp_storage_bytes; InputIteratorT d_input; OutputIteratorT d_output; OffsetT num_items; DifferenceOpT difference_op; cudaStream_t stream; CUB_RUNTIME_FUNCTION __forceinline__ DispatchAdjacentDifference(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input(d_input) , d_output(d_output) , num_items(num_items) , difference_op(difference_op) , stream(stream) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_DEPRECATED CUB_RUNTIME_FUNCTION __forceinline__ DispatchAdjacentDifference(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input(d_input) , d_output(d_output) , num_items(num_items) , difference_op(difference_op) , stream(stream) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } /// Invocation template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { using AdjacentDifferencePolicyT = typename ActivePolicyT::AdjacentDifferencePolicy; using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy; cudaError error = cudaSuccess; do { const int tile_size = AdjacentDifferencePolicyT::ITEMS_PER_TILE; const int num_tiles = static_cast(DivideAndRoundUp(num_items, tile_size)); std::size_t first_tile_previous_size = MayAlias * num_tiles * sizeof(InputT); void *allocations[1] = {nullptr}; std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage // allocation if (temp_storage_bytes == 0) { temp_storage_bytes = 1; } break; } if (num_items == OffsetT{}) { break; } auto first_tile_previous = reinterpret_cast(allocations[0]); if (MayAlias) { using AgentDifferenceInitT = AgentDifferenceInit; const int init_block_size = AgentDifferenceInitT::BLOCK_THREADS; const int init_grid_size = DivideAndRoundUp(num_tiles, init_block_size); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceAdjacentDifferenceInitKernel" "<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_block_size, reinterpret_cast(stream)); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream) .doit(DeviceAdjacentDifferenceInitKernel, d_input, first_tile_previous, num_tiles, tile_size); error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } } #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel" "<<<%d, %d, 0, %lld>>>()\n", num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, reinterpret_cast(stream)); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) .doit(DeviceAdjacentDifferenceDifferenceKernel, d_input, first_tile_previous, d_output, difference_op, num_items); 
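      // Note: DebugSyncStream() below is expected to be a no-op in normal
      // builds; when CUB's debug synchronization is enabled (e.g. by defining
      // CUB_DEBUG_SYNC), it synchronizes `stream` so that any error from the
      // kernel launched above surfaces here rather than at a later CUDA call.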
error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } } while (0); return error; } CUB_RUNTIME_FUNCTION static cudaError_t Dispatch(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream) { using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Create dispatch functor DispatchAdjacentDifference dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Dispatch(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_histogram.cuh000066400000000000000000001761721434614775400225040ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. 
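 *
 * The dispatch routines below choose a privatization strategy per problem:
 * when no channel has more than MAX_PRIVATIZED_SMEM_BINS (256) bins, the
 * sweep kernel is instantiated with shared-memory privatized histograms;
 * otherwise PRIVATIZED_SMEM_BINS is set to zero and the per-thread-block
 * histogram buffers allocated from the temporary storage are used for
 * privatization instead.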
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Histogram kernel entry points *****************************************************************************/ /** * Histogram initialization kernel entry point */ template < int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed typename CounterT, ///< Integer type for counting sample occurrences per histogram bin typename OffsetT> ///< Signed integer type for global offsets __global__ void DeviceHistogramInitKernel( ArrayWrapper num_output_bins_wrapper, ///< Number of output histogram bins per channel ArrayWrapper d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]] GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks { if ((threadIdx.x == 0) && (blockIdx.x == 0)) tile_queue.ResetDrain(); int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { if (output_bin < num_output_bins_wrapper.array[CHANNEL]) d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0; } } /** * Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block. */ template < typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256) int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed typename SampleIteratorT, ///< The input iterator type. \iterator. 
typename CounterT, ///< Integer type for counting sample occurrences per histogram bin typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel( SampleIteratorT d_samples, ///< Input data to reduce ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram ArrayWrapper num_privatized_bins_wrapper, ///< The number bins per privatized histogram ArrayWrapper d_output_histograms_wrapper, ///< Reference to final output histograms ArrayWrapper d_privatized_histograms_wrapper, ///< Reference to privatized histograms ArrayWrapper output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel ArrayWrapper privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< The number of rows in the region of interest OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest int tiles_per_row, ///< Number of image tiles per row GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks { // Thread block type for compositing input tiles typedef AgentHistogram< AgentHistogramPolicyT, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT> AgentHistogramT; // Shared memory for AgentHistogram __shared__ typename AgentHistogramT::TempStorage temp_storage; AgentHistogramT agent( temp_storage, d_samples, num_output_bins_wrapper.array, num_privatized_bins_wrapper.array, d_output_histograms_wrapper.array, d_privatized_histograms_wrapper.array, output_decode_op_wrapper.array, privatized_decode_op_wrapper.array); // Initialize counters agent.InitBinCounters(); // Consume input tiles agent.ConsumeTiles( num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue); // Store output to global (if necessary) agent.StoreOutput(); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram */ template < int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator typename CounterT, ///< Integer type for counting sample occurrences per histogram bin typename LevelT, ///< Type for specifying bin level boundaries typename OffsetT> ///< Signed integer type for global offsets struct DispatchHistogram { public: //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// The sample value type of the input 
iterator using SampleT = cub::detail::value_t; enum { // Maximum number of bins per channel for which we will use a privatized smem strategy MAX_PRIVATIZED_SMEM_BINS = 256 }; //--------------------------------------------------------------------- // Transform functors for converting samples to bin-ids //--------------------------------------------------------------------- // Searches for bin given a list of bin-boundary levels template struct SearchTransform { LevelIteratorT d_levels; // Pointer to levels array int num_output_levels; // Number of levels in array // Initializer __host__ __device__ __forceinline__ void Init( LevelIteratorT d_levels_, // Pointer to levels array int num_output_levels_) // Number of levels in array { this->d_levels = d_levels_; this->num_output_levels = num_output_levels_; } // Method for converting samples to bin-ids template __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) { /// Level iterator wrapper type // Wrap the native input pointer with CacheModifiedInputIterator // or Directly use the supplied input iterator type using WrappedLevelIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, LevelIteratorT>; WrappedLevelIteratorT wrapped_levels(d_levels); int num_bins = num_output_levels - 1; if (valid) { bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1; if (bin >= num_bins) bin = -1; } } }; // Scales samples to evenly-spaced bins struct ScaleTransform { private: using CommonT = typename cuda::std::common_type::type; static_assert(cuda::std::is_convertible::value, "The common type of `LevelT` and `SampleT` must be " "convertible to `int`."); static_assert(cuda::std::is_trivially_copyable::value, "The common type of `LevelT` and `SampleT` must be " "trivially copyable."); union ScaleT { // Used when CommonT is not floating-point to avoid intermediate // rounding errors (see NVIDIA/cub#489). struct FractionT { CommonT bins; CommonT range; } fraction; // Used when CommonT is floating-point as an optimization. 
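      // For floating-point CommonT the division can be hoisted into a single
      // precomputed reciprocal, so the per-sample bin computation reduces to
      // one subtraction and one multiplication.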
CommonT reciprocal; }; CommonT m_max; // Max sample level (exclusive) CommonT m_min; // Min sample level (inclusive) ScaleT m_scale; // Bin scaling template __host__ __device__ __forceinline__ ScaleT ComputeScale(int num_levels, T max_level, T min_level, cuda::std::true_type /* is_fp */) { ScaleT result; result.reciprocal = static_cast(static_cast(num_levels - 1) / static_cast(max_level - min_level)); return result; } template __host__ __device__ __forceinline__ ScaleT ComputeScale(int num_levels, T max_level, T min_level, cuda::std::false_type /* is_fp */) { ScaleT result; result.fraction.bins = static_cast(num_levels - 1); result.fraction.range = static_cast(max_level - min_level); return result; } template __host__ __device__ __forceinline__ ScaleT ComputeScale(int num_levels, T max_level, T min_level) { return this->ComputeScale(num_levels, max_level, min_level, cuda::std::is_floating_point{}); } #ifdef __CUDA_FP16_TYPES_EXIST__ __host__ __device__ __forceinline__ ScaleT ComputeScale(int num_levels, __half max_level, __half min_level) { NV_IF_TARGET(NV_PROVIDES_SM_53, (return this->ComputeScale(num_levels, max_level, min_level, cuda::std::true_type{});), (return this->ComputeScale(num_levels, __half2float(max_level), __half2float(min_level), cuda::std::true_type{});)); } #endif // All types but __half: template __host__ __device__ __forceinline__ int SampleIsValid(T sample, T max_level, T min_level) { return sample >= min_level && sample < max_level; } #ifdef __CUDA_FP16_TYPES_EXIST__ __host__ __device__ __forceinline__ int SampleIsValid(__half sample, __half max_level, __half min_level) { NV_IF_TARGET(NV_PROVIDES_SM_53, (return sample >= min_level && sample < max_level;), (return this->SampleIsValid(__half2float(sample), __half2float(max_level), __half2float(min_level));)); } #endif template __host__ __device__ __forceinline__ int ComputeBin(T sample, T min_level, ScaleT scale, cuda::std::true_type /* is_fp */) { return static_cast((sample - min_level) * scale.reciprocal); } template __host__ __device__ __forceinline__ int ComputeBin(T sample, T min_level, ScaleT scale, cuda::std::false_type /* is_fp */) { return static_cast(((sample - min_level) * scale.fraction.bins) / scale.fraction.range); } template __host__ __device__ __forceinline__ int ComputeBin(T sample, T min_level, ScaleT scale) { return this->ComputeBin(sample, min_level, scale, cuda::std::is_floating_point{}); } #ifdef __CUDA_FP16_TYPES_EXIST__ __host__ __device__ __forceinline__ int ComputeBin(__half sample, __half min_level, ScaleT scale) { NV_IF_TARGET(NV_PROVIDES_SM_53, (return this->ComputeBin(sample, min_level, scale, cuda::std::true_type{});), (return static_cast((__half2float(sample) - __half2float(min_level)) * __half2float(scale.reciprocal));)); } #endif public: // Initializer __host__ __device__ __forceinline__ void Init(int num_levels, LevelT max_level, LevelT min_level) { m_max = static_cast(max_level); m_min = static_cast(min_level); m_scale = this->ComputeScale(num_levels, m_max, m_min); } // Method for converting samples to bin-ids template __host__ __device__ __forceinline__ void BinSelect(SampleT sample, int &bin, bool valid) { const CommonT common_sample = static_cast(sample); if (valid && this->SampleIsValid(common_sample, m_max, m_min)) { bin = this->ComputeBin(common_sample, m_min, m_scale); } } }; // Pass-through bin transform operator struct PassThruTransform { // Method for converting samples to bin-ids template __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool 
valid) { if (valid) bin = (int) sample; } }; //--------------------------------------------------------------------- // Tuning policies //--------------------------------------------------------------------- template struct TScale { enum { V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) }; }; /// SM35 struct Policy350 { // HistogramSweepPolicy typedef AgentHistogramPolicy< 128, TScale<8>::VALUE, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true> HistogramSweepPolicy; }; /// SM50 struct Policy500 { // HistogramSweepPolicy typedef AgentHistogramPolicy< 384, TScale<16>::VALUE, BLOCK_LOAD_DIRECT, LOAD_LDG, true, SMEM, false> HistogramSweepPolicy; }; //--------------------------------------------------------------------- // Tuning policies of current PTX compiler pass //--------------------------------------------------------------------- #if (CUB_PTX_ARCH >= 500) typedef Policy500 PtxPolicy; #else typedef Policy350 PtxPolicy; #endif // "Opaque" policies (whose parameterizations aren't reflected in the type signature) struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {}; //--------------------------------------------------------------------- // Utilities //--------------------------------------------------------------------- /** * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t InitConfigs( int ptx_version, KernelConfig &histogram_sweep_config) { cudaError_t result = cudaErrorNotSupported; NV_IF_TARGET( NV_IS_DEVICE, ( // We're on the device, so initialize the kernel dispatch // configurations with the current PTX policy result = histogram_sweep_config.template Init(); ), ( // NV_IS_HOST: // We're on the host, so lookup and initialize the kernel dispatch // configurations with the policies that match the device's PTX // version if (ptx_version >= 500) { result = histogram_sweep_config.template Init(); } else { result = histogram_sweep_config.template Init(); } )); return result; } /** * Kernel kernel dispatch configuration */ struct KernelConfig { int block_threads; int pixels_per_thread; template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Init() { block_threads = BlockPolicy::BLOCK_THREADS; pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD; return cudaSuccess; } }; //--------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------- /** * Privatization-based dispatch routine */ template < typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t PrivatizedDispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. int num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel int max_num_output_bins, ///< [in] Maximum number of output bins in any channel OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for cudaStream_t stream) ///< [in] CUDA stream to launch kernels within. Default is stream0. { cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; // Get SM count int sm_count; if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; // Get SM occupancy for histogram_sweep_kernel int histogram_sweep_sm_occupancy; if (CubDebug(error = MaxSmOccupancy( histogram_sweep_sm_occupancy, histogram_sweep_kernel, histogram_sweep_config.block_threads))) break; // Get device occupancy for histogram_sweep_kernel int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count; if (num_row_pixels * NUM_CHANNELS == row_stride_samples) { // Treat as a single linear array of samples num_row_pixels *= num_rows; num_rows = 1; row_stride_samples = num_row_pixels * NUM_CHANNELS; } // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread; int tiles_per_row = static_cast(cub::DivideAndRoundUp(num_row_pixels, pixels_per_tile)); int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); int blocks_per_col = (blocks_per_row > 0) ? 
int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : 0; int num_thread_blocks = blocks_per_row * blocks_per_col; dim3 sweep_grid_dims; sweep_grid_dims.x = (unsigned int) blocks_per_row; sweep_grid_dims.y = (unsigned int) blocks_per_col; sweep_grid_dims.z = 1; // Temporary storage allocation requirements const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1; void* allocations[NUM_ALLOCATIONS] = {}; size_t allocation_sizes[NUM_ALLOCATIONS]; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT); allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue::AllocationSize(); // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the grid queue descriptor GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) ArrayWrapper d_output_histograms_wrapper; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL]; // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters) ArrayWrapper d_privatized_histograms_wrapper; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL]; // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters) ArrayWrapper privatized_decode_op_wrapper; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL]; // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters) ArrayWrapper output_decode_op_wrapper; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL]; // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters) ArrayWrapper num_privatized_bins_wrapper; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1; // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters) ArrayWrapper num_output_bins_wrapper; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1; int histogram_init_block_threads = 256; int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; // Log DeviceHistogramInitKernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); #endif // Invoke histogram_init_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( histogram_init_grid_dims, histogram_init_block_threads, 0, stream ).doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms_wrapper, tile_queue); // 
Return if empty problem if ((blocks_per_row == 0) || (blocks_per_col == 0)) break; // Log histogram_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n", sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z, histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy); #endif // Invoke histogram_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream ).doit(histogram_sweep_kernel, d_samples, num_output_bins_wrapper, num_privatized_bins_wrapper, d_output_histograms_wrapper, d_privatized_histograms_wrapper, output_decode_op_wrapper, privatized_decode_op_wrapper, num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } while (0); return error; } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t PrivatizedDispatch( void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_privatized_levels[NUM_ACTIVE_CHANNELS], PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], int max_num_output_bins, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, DeviceHistogramInitKernelT histogram_init_kernel, DeviceHistogramSweepKernelT histogram_sweep_kernel, KernelConfig histogram_sweep_config, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return PrivatizedDispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_privatized_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, histogram_init_kernel, histogram_sweep_kernel, histogram_sweep_config, stream); } /** * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit */ CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. Int2Type /*is_byte_sample*/) ///< [in] Marker type indicating whether or not SampleT is a 8b type { cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Get kernel dispatch configurations KernelConfig histogram_sweep_config; if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) break; // Use the search transform op for converting samples to privatized bins typedef SearchTransform PrivatizedDecodeOpT; // Use the pass-thru transform op for converting privatized bins to output bins typedef PassThruTransform OutputDecodeOpT; PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; int max_levels = num_output_levels[0]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); if (num_output_levels[channel] > max_levels) max_levels = num_output_levels[channel]; } int max_num_output_bins = max_levels - 1; // Dispatch if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) { // Too many bins to keep in shared memory. const int PRIVATIZED_SMEM_BINS = 0; if (CubDebug(error = PrivatizedDispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, DeviceHistogramInitKernel, DeviceHistogramSweepKernel, histogram_sweep_config, stream))) break; } else { // Dispatch shared-privatized approach const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; if (CubDebug(error = PrivatizedDispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, DeviceHistogramInitKernel, DeviceHistogramSweepKernel, histogram_sweep_config, stream))) break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type is_byte_sample) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return DispatchRange(d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, d_levels, num_row_pixels, num_rows, row_stride_samples, stream, is_byte_sample); } /** * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) */ CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, ///< [in] Device-accessible allocation of 
temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. Int2Type /*is_byte_sample*/) ///< [in] Marker type indicating whether or not SampleT is a 8b type { cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Get kernel dispatch configurations KernelConfig histogram_sweep_config; if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) break; // Use the pass-thru transform op for converting samples to privatized bins typedef PassThruTransform PrivatizedDecodeOpT; // Use the search transform op for converting privatized bins to output bins typedef SearchTransform OutputDecodeOpT; int num_privatized_levels[NUM_ACTIVE_CHANNELS]; PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; int max_levels = num_output_levels[0]; // Maximum number of levels in any channel for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { num_privatized_levels[channel] = 257; output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); if (num_output_levels[channel] > max_levels) max_levels = num_output_levels[channel]; } int max_num_output_bins = max_levels - 1; const int PRIVATIZED_SMEM_BINS = 256; if (CubDebug(error = PrivatizedDispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_privatized_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, DeviceHistogramInitKernel, DeviceHistogramSweepKernel, histogram_sweep_config, stream))) break; } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange(void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT 
num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type is_byte_sample) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return DispatchRange(d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, d_levels, num_row_pixels, num_rows, row_stride_samples, stream, is_byte_sample); } /** * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t DispatchEven( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. 
Int2Type /*is_byte_sample*/) ///< [in] Marker type indicating whether or not SampleT is a 8b type { cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Get kernel dispatch configurations KernelConfig histogram_sweep_config; if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) break; // Use the scale transform op for converting samples to privatized bins typedef ScaleTransform PrivatizedDecodeOpT; // Use the pass-thru transform op for converting privatized bins to output bins typedef PassThruTransform OutputDecodeOpT; PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; int max_levels = num_output_levels[0]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel]); if (num_output_levels[channel] > max_levels) max_levels = num_output_levels[channel]; } int max_num_output_bins = max_levels - 1; if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) { // Dispatch shared-privatized approach const int PRIVATIZED_SMEM_BINS = 0; if (CubDebug(error = PrivatizedDispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, DeviceHistogramInitKernel, DeviceHistogramSweepKernel, histogram_sweep_config, stream))) break; } else { // Dispatch shared-privatized approach const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; if (CubDebug(error = PrivatizedDispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, DeviceHistogramInitKernel, DeviceHistogramSweepKernel, histogram_sweep_config, stream))) break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type is_byte_sample) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return DispatchEven(d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_samples, stream, is_byte_sample); } /** * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t DispatchEven( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. 
The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. Int2Type /*is_byte_sample*/) ///< [in] Marker type indicating whether or not SampleT is a 8b type { cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Get kernel dispatch configurations KernelConfig histogram_sweep_config; if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) break; // Use the pass-thru transform op for converting samples to privatized bins typedef PassThruTransform PrivatizedDecodeOpT; // Use the scale transform op for converting privatized bins to output bins typedef ScaleTransform OutputDecodeOpT; int num_privatized_levels[NUM_ACTIVE_CHANNELS]; PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; int max_levels = num_output_levels[0]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { num_privatized_levels[channel] = 257; output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel]); if (num_output_levels[channel] > max_levels) max_levels = num_output_levels[channel]; } int max_num_output_bins = max_levels - 1; const int PRIVATIZED_SMEM_BINS = 256; if (CubDebug(error = PrivatizedDispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_privatized_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, DeviceHistogramInitKernel, DeviceHistogramSweepKernel, histogram_sweep_config, stream))) break; } while (0); return error; } CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type is_byte_sample) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return DispatchEven(d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, lower_level, upper_level, num_row_pixels, num_rows, 
row_stride_samples, stream, is_byte_sample); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_merge_sort.cuh000066400000000000000000000763071434614775400226540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN template void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) DeviceMergeSortBlockSortKernel(bool ping, KeyInputIteratorT keys_in, ValueInputIteratorT items_in, KeyIteratorT keys_out, ValueIteratorT items_out, OffsetT keys_count, KeyT *tmp_keys_out, ValueT *tmp_items_out, CompareOpT compare_op, char *vshmem) { extern __shared__ char shmem[]; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::MergeSortPolicy; using AgentBlockSortT = AgentBlockSort; const OffsetT vshmem_offset = blockIdx.x * AgentBlockSortT::SHARED_MEMORY_SIZE; typename AgentBlockSortT::TempStorage &storage = *reinterpret_cast( UseVShmem ? 
vshmem + vshmem_offset : shmem); AgentBlockSortT agent(ping, storage, THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in), THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in), keys_count, keys_out, items_out, tmp_keys_out, tmp_items_out, compare_op); agent.Process(); } template __global__ void DeviceMergeSortPartitionKernel(bool ping, KeyIteratorT keys_ping, KeyT *keys_pong, OffsetT keys_count, OffsetT num_partitions, OffsetT *merge_partitions, CompareOpT compare_op, OffsetT target_merged_tiles_number, int items_per_tile) { OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x; if (partition_idx < num_partitions) { AgentPartition agent( ping, keys_ping, keys_pong, keys_count, partition_idx, merge_partitions, compare_op, target_merged_tiles_number, items_per_tile); agent.Process(); } } template void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) DeviceMergeSortMergeKernel(bool ping, KeyIteratorT keys_ping, ValueIteratorT items_ping, OffsetT keys_count, KeyT *keys_pong, ValueT *items_pong, CompareOpT compare_op, OffsetT *merge_partitions, OffsetT target_merged_tiles_number, char *vshmem) { extern __shared__ char shmem[]; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::MergeSortPolicy; using AgentMergeT = AgentMerge; const OffsetT vshmem_offset = blockIdx.x * AgentMergeT::SHARED_MEMORY_SIZE; typename AgentMergeT::TempStorage &storage = *reinterpret_cast( UseVShmem ? vshmem + vshmem_offset : shmem); AgentMergeT agent( ping, storage, THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping), THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping), THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong), THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong), keys_count, keys_pong, items_pong, keys_ping, items_ping, compare_op, merge_partitions, target_merged_tiles_number); agent.Process(); } /******************************************************************************* * Policy ******************************************************************************/ template struct DeviceMergeSortPolicy { using KeyT = cub::detail::value_t; //---------------------------------------------------------------------------- // Architecture-specific tuning policies //---------------------------------------------------------------------------- struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { using MergeSortPolicy = AgentMergeSortPolicy<256, Nominal4BItemsToItems(11), cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_STORE_WARP_TRANSPOSE>; }; // NVBug 3384810 #if defined(_NVHPC_CUDA) using Policy520 = Policy350; #else struct Policy520 : ChainedPolicy<520, Policy520, Policy350> { using MergeSortPolicy = AgentMergeSortPolicy<512, Nominal4BItemsToItems(15), cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_STORE_WARP_TRANSPOSE>; }; #endif struct Policy600 : ChainedPolicy<600, Policy600, Policy520> { using MergeSortPolicy = AgentMergeSortPolicy<256, Nominal4BItemsToItems(17), cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_STORE_WARP_TRANSPOSE>; }; /// MaxPolicy using MaxPolicy = Policy600; }; template struct BlockSortLauncher { int num_tiles; std::size_t block_sort_shmem_size; bool ping; KeyInputIteratorT d_input_keys; ValueInputIteratorT d_input_items; KeyIteratorT d_output_keys; ValueIteratorT d_output_items; OffsetT num_items; 
CompareOpT compare_op; cudaStream_t stream; KeyT *keys_buffer; ValueT *items_buffer; char* vshmem_ptr; CUB_RUNTIME_FUNCTION __forceinline__ BlockSortLauncher(int num_tiles, std::size_t block_sort_shmem_size, bool ping, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, KeyT *keys_buffer, ValueT *items_buffer, char *vshmem_ptr) : num_tiles(num_tiles) , block_sort_shmem_size(block_sort_shmem_size) , ping(ping) , d_input_keys(d_input_keys) , d_input_items(d_input_items) , d_output_keys(d_output_keys) , d_output_items(d_output_items) , num_items(num_items) , compare_op(compare_op) , stream(stream) , keys_buffer(keys_buffer) , items_buffer(items_buffer) , vshmem_ptr(vshmem_ptr) {} CUB_RUNTIME_FUNCTION __forceinline__ void launch() const { if (vshmem_ptr) { launch_impl(); } else { launch_impl(); } } template CUB_RUNTIME_FUNCTION __forceinline__ void launch_impl() const { constexpr bool use_vshmem = (AgentFitsIntoDefaultShmemSize == false) && UseVShmem; THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, ActivePolicyT::MergeSortPolicy::BLOCK_THREADS, use_vshmem ? 0 : block_sort_shmem_size, stream) .doit(DeviceMergeSortBlockSortKernel, ping, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, keys_buffer, items_buffer, compare_op, vshmem_ptr); } }; template struct MergeLauncher { int num_tiles; std::size_t merge_shmem_size; KeyIteratorT d_keys; ValueIteratorT d_items; OffsetT num_items; CompareOpT compare_op; OffsetT *merge_partitions; cudaStream_t stream; KeyT *keys_buffer; ValueT *items_buffer; char *vshmem_ptr; CUB_RUNTIME_FUNCTION __forceinline__ MergeLauncher(int num_tiles, std::size_t merge_shmem_size, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, OffsetT *merge_partitions, cudaStream_t stream, KeyT *keys_buffer, ValueT *items_buffer, char *vshmem_ptr) : num_tiles(num_tiles) , merge_shmem_size(merge_shmem_size) , d_keys(d_keys) , d_items(d_items) , num_items(num_items) , compare_op(compare_op) , merge_partitions(merge_partitions) , stream(stream) , keys_buffer(keys_buffer) , items_buffer(items_buffer) , vshmem_ptr(vshmem_ptr) {} CUB_RUNTIME_FUNCTION __forceinline__ void launch(bool ping, OffsetT target_merged_tiles_number) const { if (vshmem_ptr) { launch_impl(ping, target_merged_tiles_number); } else { launch_impl(ping, target_merged_tiles_number); } } template CUB_RUNTIME_FUNCTION __forceinline__ void launch_impl(bool ping, OffsetT target_merged_tiles_number) const { constexpr bool use_vshmem = (AgentFitsIntoDefaultShmemSize == false) && UseVShmem; THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, ActivePolicyT::MergeSortPolicy::BLOCK_THREADS, use_vshmem ? 0 : merge_shmem_size, stream) .doit(DeviceMergeSortMergeKernel, ping, d_keys, d_items, num_items, keys_buffer, items_buffer, compare_op, merge_partitions, target_merged_tiles_number, vshmem_ptr); } }; template > struct DispatchMergeSort : SelectedPolicy { using KeyT = cub::detail::value_t; using ValueT = cub::detail::value_t; /// Whether or not there are values to be trucked along with keys static constexpr bool KEYS_ONLY = std::is_same::value; // Problem state /// Device-accessible allocation of temporary storage. When NULL, the required /// allocation size is written to \p temp_storage_bytes and no work is done. 
void *d_temp_storage; /// Reference to size in bytes of \p d_temp_storage allocation std::size_t &temp_storage_bytes; /// Pointer to the input sequence of unsorted input keys KeyInputIteratorT d_input_keys; /// Pointer to the input sequence of unsorted input values ValueInputIteratorT d_input_items; /// Pointer to the output sequence of sorted input keys KeyIteratorT d_output_keys; /// Pointer to the output sequence of sorted input values ValueIteratorT d_output_items; /// Number of items to sort OffsetT num_items; /// Comparison function object which returns true if the first argument is /// ordered before the second CompareOpT compare_op; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; int ptx_version; // Constructor CUB_RUNTIME_FUNCTION __forceinline__ DispatchMergeSort(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input_keys(d_input_keys) , d_input_items(d_input_items) , d_output_keys(d_output_keys) , d_output_items(d_output_items) , num_items(num_items) , compare_op(compare_op) , stream(stream) , ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ DispatchMergeSort(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input_keys(d_input_keys) , d_input_items(d_input_items) , d_output_keys(d_output_keys) , d_output_items(d_output_items) , num_items(num_items) , compare_op(compare_op) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } // Invocation template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { using MergePolicyT = typename ActivePolicyT::MergeSortPolicy; using MaxPolicyT = typename DispatchMergeSort::MaxPolicy; using BlockSortAgentT = AgentBlockSort; using MergeAgentT = AgentMerge; cudaError error = cudaSuccess; if (num_items == 0) return error; do { // Get device ordinal int device_ordinal = 0; if (CubDebug(error = cudaGetDevice(&device_ordinal))) { break; } // Get shared memory size const auto tile_size = MergePolicyT::ITEMS_PER_TILE; const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size); /** * Merge sort supports large types, which can lead to excessive shared * memory size requirements. In these cases, merge sort allocates virtual * shared memory that resides in global memory: * ``` * extern __shared__ char shmem[]; * typename AgentT::TempStorage &storage = * *reinterpret_cast( * UseVShmem ? vshmem + vshmem_offset : shmem); * ``` * Having `UseVShmem` as a runtime variable leads to the generation of * generic loads and stores, which causes a slowdown. Therefore, * `UseVShmem` has to be known at compilation time. * In the generic case, available shared memory size is queried at runtime * to check if kernels requirements are satisfied. Since the query result * is not known at compile-time, merge sort kernels are specialized for * both cases. 
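     * (When the runtime check is required, both specializations are
     * instantiated, and BlockSortLauncher / MergeLauncher select between them
     * at run time based on whether a virtual shared memory arena was carved
     * out of the temporary storage.)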
* To address increased compilation time, the dispatch layer checks * whether kernels requirements fit into default shared memory * size (48KB). In this case, there's no need for virtual shared * memory specialization. */ constexpr std::size_t default_shared_memory_size = 48 * 1024; constexpr auto block_sort_shmem_size = static_cast(BlockSortAgentT::SHARED_MEMORY_SIZE); constexpr bool block_sort_fits_into_default_shmem = block_sort_shmem_size < default_shared_memory_size; constexpr auto merge_shmem_size = static_cast(MergeAgentT::SHARED_MEMORY_SIZE); constexpr bool merge_fits_into_default_shmem = merge_shmem_size < default_shared_memory_size; constexpr bool runtime_shmem_size_check_is_required = !(merge_fits_into_default_shmem && block_sort_fits_into_default_shmem); const auto merge_partitions_size = static_cast(1 + num_tiles) * sizeof(OffsetT); const auto temporary_keys_storage_size = static_cast(num_items * sizeof(KeyT)); const auto temporary_values_storage_size = static_cast(num_items * sizeof(ValueT)) * !KEYS_ONLY; std::size_t virtual_shared_memory_size = 0; bool block_sort_requires_vshmem = false; bool merge_requires_vshmem = false; if (runtime_shmem_size_check_is_required) { int max_shmem = 0; if (CubDebug( error = cudaDeviceGetAttribute(&max_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_ordinal))) { break; } block_sort_requires_vshmem = block_sort_shmem_size > static_cast(max_shmem); merge_requires_vshmem = merge_shmem_size > static_cast(max_shmem); virtual_shared_memory_size = detail::VshmemSize(static_cast(max_shmem), (cub::max)(block_sort_shmem_size, merge_shmem_size), static_cast(num_tiles)); } void *allocations[4] = {nullptr, nullptr, nullptr, nullptr}; std::size_t allocation_sizes[4] = {merge_partitions_size, temporary_keys_storage_size, temporary_values_storage_size, virtual_shared_memory_size}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage // allocation break; } const int num_passes = static_cast(THRUST_NS_QUALIFIER::detail::log2_ri(num_tiles)); /* * The algorithm consists of stages. At each stage, there are input and * output arrays. There are two pairs of arrays allocated (keys and items). * One pair is from function arguments and another from temporary storage. * Ping is a helper variable that controls which of these two pairs of * arrays is an input and which is an output for a current stage. If the * ping is true - the current stage stores its result in the temporary * storage. The temporary storage acts as input data otherwise. * * Block sort is executed before the main loop. It stores its result in * the pair of arrays that will be an input of the next stage. The initial * value of the ping variable is selected so that the result of the final * stage is stored in the input arrays. */ bool ping = num_passes % 2 == 0; auto merge_partitions = reinterpret_cast(allocations[0]); auto keys_buffer = reinterpret_cast(allocations[1]); auto items_buffer = reinterpret_cast(allocations[2]); char *vshmem_ptr = virtual_shared_memory_size > 0 ? reinterpret_cast(allocations[3]) : nullptr; // Invoke DeviceReduceKernel BlockSortLauncher block_sort_launcher(static_cast(num_tiles), block_sort_shmem_size, ping, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream, keys_buffer, items_buffer, block_sort_requires_vshmem ? 
vshmem_ptr : nullptr); block_sort_launcher.launch(); error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } const OffsetT num_partitions = num_tiles + 1; const int threads_per_partition_block = 256; const int partition_grid_size = static_cast( cub::DivideAndRoundUp(num_partitions, threads_per_partition_block)); MergeLauncher merge_launcher(static_cast(num_tiles), merge_shmem_size, d_output_keys, d_output_items, num_items, compare_op, merge_partitions, stream, keys_buffer, items_buffer, merge_requires_vshmem ? vshmem_ptr : nullptr); for (int pass = 0; pass < num_passes; ++pass, ping = !ping) { OffsetT target_merged_tiles_number = OffsetT(2) << pass; // Partition THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( partition_grid_size, threads_per_partition_block, 0, stream) .doit(DeviceMergeSortPartitionKernel, ping, d_output_keys, keys_buffer, num_items, num_partitions, merge_partitions, compare_op, target_merged_tiles_number, tile_size); error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Merge merge_launcher.launch(ping, target_merged_tiles_number); error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } } } while (0); return error; } CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream) { using MaxPolicyT = typename DispatchMergeSort::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Create dispatch functor DispatchMergeSort dispatch(d_temp_storage, temp_storage_bytes, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream, ptx_version); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_radix_sort.cuh000066400000000000000000003124111434614775400226510ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // suppress warnings triggered by #pragma unroll: // "warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]" #if defined(__clang__) # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wpass-failed" #endif CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. */ template < typename ChainedPolicyT, ///< Chained tuning policy bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) : int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) __global__ void DeviceRadixSortUpsweepKernel( const KeyT *d_keys, ///< [in] Input keys buffer OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
OffsetT /*num_items*/, ///< [in] Total number of input data items int current_bit, ///< [in] Bit position of current radix digit int num_bits, ///< [in] Number of bits of current radix digit GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block { using ActiveUpsweepPolicyT = cub::detail::conditional_t< ALT_DIGIT_BITS, typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy, typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>; using ActiveDownsweepPolicyT = cub::detail::conditional_t< ALT_DIGIT_BITS, typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy, typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>; enum { TILE_ITEMS = CUB_MAX( ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD, ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD) }; // Parameterize AgentRadixSortUpsweep type for the current configuration typedef AgentRadixSortUpsweep< ActiveUpsweepPolicyT, KeyT, OffsetT> AgentRadixSortUpsweepT; // Shared memory storage __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage; // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block even_share.template BlockInit(); AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits); upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); CTA_SYNC(); // Write out digit counts (striped) upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); } /** * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms */ template < typename ChainedPolicyT, ///< Chained tuning policy typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) __global__ void RadixSortScanBinsKernel( OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) int num_counts) ///< [in] Total number of bin-counts { // Parameterize the AgentScan type for the current configuration typedef AgentScan< typename ChainedPolicyT::ActivePolicy::ScanPolicy, OffsetT*, OffsetT*, cub::Sum, OffsetT, OffsetT, OffsetT> AgentScanT; // Shared memory storage __shared__ typename AgentScanT::TempStorage temp_storage; // Block scan instance AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ; // Process full input tiles int block_offset = 0; BlockScanRunningPrefixOp prefix_op(0, Sum()); while (block_offset + AgentScanT::TILE_ITEMS <= num_counts) { block_scan.template ConsumeTile(block_offset, prefix_op); block_offset += AgentScanT::TILE_ITEMS; } // Process the remaining partial tile (if any). if (block_offset < num_counts) { block_scan.template ConsumeTile(block_offset, prefix_op, num_counts - block_offset); } } /** * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. */ template < typename ChainedPolicyT, ///< Chained tuning policy bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename ValueT, ///< Value type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int((ALT_DIGIT_BITS) ? 
int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) : int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) __global__ void DeviceRadixSortDownsweepKernel( const KeyT *d_keys_in, ///< [in] Input keys buffer KeyT *d_keys_out, ///< [in] Output keys buffer const ValueT *d_values_in, ///< [in] Input values buffer ValueT *d_values_out, ///< [in] Output values buffer OffsetT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) OffsetT num_items, ///< [in] Total number of input data items int current_bit, ///< [in] Bit position of current radix digit int num_bits, ///< [in] Number of bits of current radix digit GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block { using ActiveUpsweepPolicyT = cub::detail::conditional_t< ALT_DIGIT_BITS, typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy, typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>; using ActiveDownsweepPolicyT = cub::detail::conditional_t< ALT_DIGIT_BITS, typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy, typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>; enum { TILE_ITEMS = CUB_MAX( ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD, ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD) }; // Parameterize AgentRadixSortDownsweep type for the current configuration typedef AgentRadixSortDownsweep< ActiveDownsweepPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> AgentRadixSortDownsweepT; // Shared memory storage __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; // Initialize even-share descriptor for this thread block even_share.template BlockInit(); // Process input tiles AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( even_share.block_offset, even_share.block_end); } /** * Single pass kernel entry point (single-block). Fully sorts a tile of input. 
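 *
 * The kernel loads the whole tile with BlockLoad, padding out-of-bounds keys
 * with the most extreme key value for the requested sort order so that they
 * land at the end of the tile, sorts the tile with
 * BlockRadixSort::SortBlockedToStriped, and then performs a guarded striped
 * store of the keys (and, for key-value sorts, the values).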
*/ template < typename ChainedPolicyT, ///< Chained tuning policy bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename ValueT, ///< Value type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) __global__ void DeviceRadixSortSingleTileKernel( const KeyT *d_keys_in, ///< [in] Input keys buffer KeyT *d_keys_out, ///< [in] Output keys buffer const ValueT *d_values_in, ///< [in] Input values buffer ValueT *d_values_out, ///< [in] Output values buffer OffsetT num_items, ///< [in] Total number of input data items int current_bit, ///< [in] Bit position of current radix digit int end_bit) ///< [in] The past-the-end (most-significant) bit index needed for key comparison { // Constants enum { BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, KEYS_ONLY = std::is_same::value, }; // BlockRadixSort type typedef BlockRadixSort< KeyT, BLOCK_THREADS, ITEMS_PER_THREAD, ValueT, ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS, (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE), ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM> BlockRadixSortT; // BlockLoad type (keys) typedef BlockLoad< KeyT, BLOCK_THREADS, ITEMS_PER_THREAD, ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys; // BlockLoad type (values) typedef BlockLoad< ValueT, BLOCK_THREADS, ITEMS_PER_THREAD, ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues; // Unsigned word for key bits typedef typename Traits::UnsignedBits UnsignedBitsT; // Shared memory storage __shared__ union TempStorage { typename BlockRadixSortT::TempStorage sort; typename BlockLoadKeys::TempStorage load_keys; typename BlockLoadValues::TempStorage load_values; } temp_storage; // Keys and values for the block KeyT keys[ITEMS_PER_THREAD]; ValueT values[ITEMS_PER_THREAD]; // Get default (min/max) value for out-of-bounds keys UnsignedBitsT default_key_bits = (IS_DESCENDING) ? 
Traits::LOWEST_KEY : Traits::MAX_KEY; KeyT default_key = reinterpret_cast(default_key_bits); // Load keys BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); CTA_SYNC(); // Load values if (!KEYS_ONLY) { // Register pressure work-around: moving num_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads num_items = ShuffleIndex(num_items, 0, 0xffffffff); BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); CTA_SYNC(); } // Sort tile BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( keys, values, current_bit, end_bit, Int2Type(), Int2Type()); // Store keys and values #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; if (item_offset < num_items) { d_keys_out[item_offset] = keys[ITEM]; if (!KEYS_ONLY) d_values_out[item_offset] = values[ITEM]; } } } /** * Segmented radix sorting pass (one block per segment) */ template < typename ChainedPolicyT, ///< Chained tuning policy bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename ValueT, ///< Value type typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) __global__ void DeviceSegmentedRadixSortKernel( const KeyT *d_keys_in, ///< [in] Input keys buffer KeyT *d_keys_out, ///< [in] Output keys buffer const ValueT *d_values_in, ///< [in] Input values buffer ValueT *d_values_out, ///< [in] Output values buffer BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data int current_bit, ///< [in] Bit position of current radix digit int pass_bits) ///< [in] Number of bits of current radix digit { // // Constants // using SegmentedPolicyT = cub::detail::conditional_t< ALT_DIGIT_BITS, typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy, typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>; enum { BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, RADIX_BITS = SegmentedPolicyT::RADIX_BITS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, RADIX_DIGITS = 1 << RADIX_BITS, KEYS_ONLY = std::is_same::value, }; // Upsweep type using BlockUpsweepT = AgentRadixSortUpsweep; // Digit-scan type using DigitScanT = BlockScan; // Downsweep type using BlockDownsweepT = AgentRadixSortDownsweep; enum { /// Number of bin-starting offsets tracked per thread BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD }; // // Process input tiles // // Shared memory storage __shared__ union { typename BlockUpsweepT::TempStorage upsweep; typename BlockDownsweepT::TempStorage downsweep; struct { volatile OffsetT reverse_counts_in[RADIX_DIGITS]; volatile OffsetT reverse_counts_out[RADIX_DIGITS]; typename DigitScanT::TempStorage scan; }; } temp_storage; OffsetT segment_begin = d_begin_offsets[blockIdx.x]; OffsetT segment_end = d_end_offsets[blockIdx.x]; OffsetT num_items = segment_end - segment_begin; // Check if empty segment if (num_items <= 0) return; // Upsweep BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); upsweep.ProcessRegion(segment_begin, segment_end); CTA_SYNC(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); CTA_SYNC(); if (IS_DESCENDING) { // Reverse bin counts #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; } CTA_SYNC(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; } } // Scan OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads) DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { bin_offset[track] += segment_begin; } if (IS_DESCENDING) { // Reverse bin offsets #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; } CTA_SYNC(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; } } CTA_SYNC(); // Downsweep BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, 
d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); downsweep.ProcessRegion(segment_begin, segment_end); } /****************************************************************************** * Onesweep kernels ******************************************************************************/ /** * Kernel for computing multiple histograms */ /** * Histogram kernel */ template < typename ChainedPolicyT, bool IS_DESCENDING, typename KeyT, typename OffsetT> __global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) DeviceRadixSortHistogramKernel (OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit) { typedef typename ChainedPolicyT::ActivePolicy::HistogramPolicy HistogramPolicyT; typedef AgentRadixSortHistogram AgentT; __shared__ typename AgentT::TempStorage temp_storage; AgentT agent(temp_storage, d_bins_out, d_keys_in, num_items, start_bit, end_bit); agent.Process(); } template < typename ChainedPolicyT, bool IS_DESCENDING, typename KeyT, typename ValueT, typename OffsetT, typename PortionOffsetT, typename AtomicOffsetT = PortionOffsetT> __global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS) DeviceRadixSortOnesweepKernel (AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out, const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out, const ValueT* d_values_in, PortionOffsetT num_items, int current_bit, int num_bits) { typedef typename ChainedPolicyT::ActivePolicy::OnesweepPolicy OnesweepPolicyT; typedef AgentRadixSortOnesweep AgentT; __shared__ typename AgentT::TempStorage s; AgentT agent(s, d_lookback, d_ctrs, d_bins_out, d_bins_in, d_keys_out, d_keys_in, d_values_out, d_values_in, num_items, current_bit, num_bits); agent.Process(); } /** * Exclusive sum kernel */ template < typename ChainedPolicyT, typename OffsetT> __global__ void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins) { typedef typename ChainedPolicyT::ActivePolicy::ExclusiveSumPolicy ExclusiveSumPolicyT; const int RADIX_BITS = ExclusiveSumPolicyT::RADIX_BITS; const int RADIX_DIGITS = 1 << RADIX_BITS; const int BLOCK_THREADS = ExclusiveSumPolicyT::BLOCK_THREADS; const int BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS; typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; // load the bins OffsetT bins[BINS_PER_THREAD]; int bin_start = blockIdx.x * RADIX_DIGITS; #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = threadIdx.x * BINS_PER_THREAD + u; if (bin >= RADIX_DIGITS) break; bins[u] = d_bins[bin_start + bin]; } // compute offsets BlockScan(temp_storage).ExclusiveSum(bins, bins); // store the offsets #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = threadIdx.x * BINS_PER_THREAD + u; if (bin >= RADIX_DIGITS) break; d_bins[bin_start + bin] = bins[u]; } } /****************************************************************************** * Policy ******************************************************************************/ /** * Tuning policy for kernel specialization */ template < typename KeyT, ///< Key type typename ValueT, ///< Value type typename OffsetT> ///< Signed integer type for global offsets struct DeviceRadixSortPolicy { //------------------------------------------------------------------------------ // Constants //------------------------------------------------------------------------------ // Whether this is a keys-only (or key-value) sort 
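// (By convention, a keys-only sort is requested by instantiating the
// dispatch with cub::NullType as the value type, which is what the check
// below detects.)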
constexpr static bool KEYS_ONLY = std::is_same::value; // Dominant-sized key/value type using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>; //------------------------------------------------------------------------------ // Architecture-specific tuning policies //------------------------------------------------------------------------------ /// SM35 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) ONESWEEP = false, ONESWEEP_RADIX_BITS = 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // Scan policy typedef AgentScanPolicy <1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; // Keys-only downsweep policies typedef AgentRadixSortDownsweepPolicy <128, 9, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys; // Key-value pairs downsweep policies typedef DownsweepPolicyKeys DownsweepPolicyPairs; typedef AgentRadixSortDownsweepPolicy <128, 15, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs; // Downsweep policies using DownsweepPolicy = cub::detail::conditional_t< KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>; using AltDownsweepPolicy = cub::detail::conditional_t; // Upsweep policies using UpsweepPolicy = DownsweepPolicy; using AltUpsweepPolicy = AltDownsweepPolicy; // Single-tile policy using SingleTilePolicy = DownsweepPolicy; // Segmented policies using SegmentedPolicy = DownsweepPolicy; using AltSegmentedPolicy = AltDownsweepPolicy; }; /// SM50 struct Policy500 : ChainedPolicy<500, Policy500, Policy350> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.1B 32b segmented keys/s (TitanX) ONESWEEP = false, ONESWEEP_RADIX_BITS = 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy <160, 39, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy <256, 16, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef DownsweepPolicy UpsweepPolicy; typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy <192, 31, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy <256, 11, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// SM60 (GP100) struct Policy600 : ChainedPolicy<600, Policy600, Policy500> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), // 10.0B 32b keys/s (GP100, 64M random keys) ONESWEEP_RADIX_BITS = 8, OFFSET_64BIT = sizeof(OffsetT) == 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, OFFSET_64BIT ? 29 : 30, DominantT, 2, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy <256, 25, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy <192, OFFSET_64BIT ? 
32 : 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef DownsweepPolicy UpsweepPolicy; typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// SM61 (GP104) struct Policy610 : ChainedPolicy<610, Policy610, Policy600> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.3B 32b segmented keys/s (1080) ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), ONESWEEP_RADIX_BITS = 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy <384, 31, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy <256, 35, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef AgentRadixSortUpsweepPolicy <128, 16, DominantT, LOAD_LDG, PRIMARY_RADIX_BITS> UpsweepPolicy; typedef AgentRadixSortUpsweepPolicy <128, 16, DominantT, LOAD_LDG, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// SM62 (Tegra, less RF) struct Policy620 : ChainedPolicy<620, Policy620, Policy610> { enum { PRIMARY_RADIX_BITS = 5, ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), ONESWEEP_RADIX_BITS = 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, 
RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy <256, 16, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy <256, 16, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS> AltDownsweepPolicy; // Upsweep policies typedef DownsweepPolicy UpsweepPolicy; typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef DownsweepPolicy SegmentedPolicy; typedef AltDownsweepPolicy AltSegmentedPolicy; }; /// SM70 (GV100) struct Policy700 : ChainedPolicy<700, Policy700, Policy620> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 8.7B 32b segmented keys/s (GV100) ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), // 15.8B 32b keys/s (V100-SXM2, 64M random keys) ONESWEEP_RADIX_BITS = 8, OFFSET_64BIT = sizeof(OffsetT) == 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, sizeof(KeyT) == 4 && sizeof(ValueT) == 4 ? 46 : 23, DominantT, 4, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy <512, 23, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy <(sizeof(KeyT) > 1) ? 256 : 128, OFFSET_64BIT ? 46 : 47, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef AgentRadixSortUpsweepPolicy <256, 23, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicy; typedef AgentRadixSortUpsweepPolicy <256, OFFSET_64BIT ? 
46 : 47, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// SM80 struct Policy800 : ChainedPolicy<800, Policy800, Policy700> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), ONESWEEP_RADIX_BITS = 8, OFFSET_64BIT = sizeof(OffsetT) == 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy <128, 16, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy <384, OFFSET_64BIT && sizeof(KeyT) == 4 && !KEYS_ONLY ? 17 : 21, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy <512, 23, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy <(sizeof(KeyT) > 1) ? 
256 : 128, 47, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef AgentRadixSortUpsweepPolicy <256, 23, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicy; typedef AgentRadixSortUpsweepPolicy <256, 47, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// MaxPolicy typedef Policy800 MaxPolicy; }; /****************************************************************************** * Single-problem dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort */ template < bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename ValueT, ///< Value type typename OffsetT, ///< Signed integer type for global offsets typename SelectedPolicy = DeviceRadixSortPolicy > struct DispatchRadixSort : SelectedPolicy { //------------------------------------------------------------------------------ // Constants //------------------------------------------------------------------------------ // Whether this is a keys-only (or key-value) sort constexpr static bool KEYS_ONLY = std::is_same::value; //------------------------------------------------------------------------------ // Problem state //------------------------------------------------------------------------------ void *d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values OffsetT num_items; ///< [in] Number of items to sort int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. 
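// An illustrative sketch of how the double-buffer convention above is
// typically exercised through the public front end (buffer names are
// placeholders, not part of this header):
//
//   cub::DoubleBuffer<KeyT> d_keys(d_key_buf, d_key_alt_buf);
//   void        *d_temp_storage     = nullptr;
//   std::size_t  temp_storage_bytes = 0;
//   cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
//                                  d_keys, num_items);   // size query only
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
//                                  d_keys, num_items);   // perform the sort
//   // d_keys.Current() now references the sorted keys; with this overload
//   // either buffer may have been overwritten (is_overwrite_okay == true).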
int ptx_version; ///< [in] PTX version bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers //------------------------------------------------------------------------------ // Constructor //------------------------------------------------------------------------------ /// Constructor CUB_RUNTIME_FUNCTION __forceinline__ DispatchRadixSort( void* d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage), temp_storage_bytes(temp_storage_bytes), d_keys(d_keys), d_values(d_values), num_items(num_items), begin_bit(begin_bit), end_bit(end_bit), stream(stream), ptx_version(ptx_version), is_overwrite_okay(is_overwrite_okay) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ DispatchRadixSort( void* d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage), temp_storage_bytes(temp_storage_bytes), d_keys(d_keys), d_values(d_values), num_items(num_items), begin_bit(begin_bit), end_bit(end_bit), stream(stream), ptx_version(ptx_version), is_overwrite_okay(is_overwrite_okay) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } //------------------------------------------------------------------------------ // Small-problem (single tile) invocation //------------------------------------------------------------------------------ /// Invoke a single block to sort in-core template < typename ActivePolicyT, ///< Umbrella policy active for the target device typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokeSingleTile( SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel { cudaError error = cudaSuccess; do { // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) { temp_storage_bytes = 1; break; } // Log single_tile_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); #endif // Invoke upsweep_kernel with same grid size as downsweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream ).doit(single_tile_kernel, d_keys.Current(), d_keys.Alternate(), d_values.Current(), d_values.Alternate(), num_items, begin_bit, end_bit); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Update selector d_keys.selector ^= 1; d_values.selector ^= 1; } while (0); return error; } //------------------------------------------------------------------------------ // Normal problem size invocation //------------------------------------------------------------------------------ /** * Invoke a three-kernel sorting pass at the current bit. 
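 *
 * The three kernels launched below are, in order: an upsweep that builds
 * privatized per-block digit histograms, a single-block spine scan that
 * turns those histograms into global scatter offsets, and a downsweep that
 * scatters keys (and values) into their destination bins for the current
 * digit place.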
*/ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePass( const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, OffsetT *d_spine, int /*spine_length*/, int ¤t_bit, PassConfigT &pass_config) { cudaError error = cudaSuccess; do { int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log upsweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); #endif // Spine length written by the upsweep kernel in the current pass. int pass_spine_length = pass_config.even_share.grid_size * pass_config.radix_digits; // Invoke upsweep_kernel with same grid size as downsweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream ).doit(pass_config.upsweep_kernel, d_keys_in, d_spine, num_items, current_bit, pass_bits, pass_config.even_share); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Log scan_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); #endif // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( 1, pass_config.scan_config.block_threads, 0, stream ).doit(pass_config.scan_kernel, d_spine, pass_spine_length); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Log downsweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream, pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); #endif // Invoke downsweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream ).doit(pass_config.downsweep_kernel, d_keys_in, d_keys_out, d_values_in, d_values_out, d_spine, num_items, current_bit, pass_bits, pass_config.even_share); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Update current bit current_bit += pass_bits; } while (0); return error; } /// Pass configuration structure template < typename UpsweepKernelT, typename ScanKernelT, typename DownsweepKernelT> struct PassConfig { UpsweepKernelT upsweep_kernel; KernelConfig upsweep_config; ScanKernelT scan_kernel; KernelConfig scan_config; DownsweepKernelT downsweep_kernel; KernelConfig downsweep_config; int radix_bits; int radix_digits; int max_downsweep_grid_size; GridEvenShare even_share; /// Initialize pass configuration 
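/// (Queries occupancy for the upsweep, scan, and downsweep kernels and
/// initializes the even-share work distribution over the input.)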
template < typename UpsweepPolicyT, typename ScanPolicyT, typename DownsweepPolicyT> CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitPassConfig( UpsweepKernelT upsweep_kernel, ScanKernelT scan_kernel, DownsweepKernelT downsweep_kernel, int /*ptx_version*/, int sm_count, OffsetT num_items) { cudaError error = cudaSuccess; do { this->upsweep_kernel = upsweep_kernel; this->scan_kernel = scan_kernel; this->downsweep_kernel = downsweep_kernel; radix_bits = DownsweepPolicyT::RADIX_BITS; radix_digits = 1 << radix_bits; if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; if (CubDebug(error = scan_config.Init(scan_kernel))) break; if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0); even_share.DispatchInit( num_items, max_downsweep_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); } while (0); return error; } }; template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokeOnesweep() { typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; // PortionOffsetT is used for offsets within a portion, and must be signed. typedef int PortionOffsetT; typedef PortionOffsetT AtomicOffsetT; // compute temporary storage size const int RADIX_BITS = ActivePolicyT::ONESWEEP_RADIX_BITS; const int RADIX_DIGITS = 1 << RADIX_BITS; const int ONESWEEP_ITEMS_PER_THREAD = ActivePolicyT::OnesweepPolicy::ITEMS_PER_THREAD; const int ONESWEEP_BLOCK_THREADS = ActivePolicyT::OnesweepPolicy::BLOCK_THREADS; const int ONESWEEP_TILE_ITEMS = ONESWEEP_ITEMS_PER_THREAD * ONESWEEP_BLOCK_THREADS; // portions handle inputs with >=2**30 elements, due to the way lookback works // for testing purposes, one portion is <= 2**28 elements const PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS; int num_passes = cub::DivideAndRoundUp(end_bit - begin_bit, RADIX_BITS); OffsetT num_portions = static_cast(cub::DivideAndRoundUp(num_items, PORTION_SIZE)); PortionOffsetT max_num_blocks = cub::DivideAndRoundUp( static_cast( CUB_MIN(num_items, static_cast(PORTION_SIZE))), ONESWEEP_TILE_ITEMS); size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT); size_t allocation_sizes[] = { // bins num_portions * num_passes * RADIX_DIGITS * sizeof(OffsetT), // lookback max_num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), // extra key buffer is_overwrite_okay || num_passes <= 1 ? 0 : num_items * sizeof(KeyT), // extra value buffer is_overwrite_okay || num_passes <= 1 ? 
0 : num_items * value_size, // counters num_portions * num_passes * sizeof(AtomicOffsetT), }; const int NUM_ALLOCATIONS = sizeof(allocation_sizes) / sizeof(allocation_sizes[0]); void* allocations[NUM_ALLOCATIONS] = {}; AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); // just return if no temporary storage is provided cudaError_t error = cudaSuccess; if (d_temp_storage == NULL) return error; OffsetT* d_bins = (OffsetT*)allocations[0]; AtomicOffsetT* d_lookback = (AtomicOffsetT*)allocations[1]; KeyT* d_keys_tmp2 = (KeyT*)allocations[2]; ValueT* d_values_tmp2 = (ValueT*)allocations[3]; AtomicOffsetT* d_ctrs = (AtomicOffsetT*)allocations[4]; do { // initialization if (CubDebug(error = cudaMemsetAsync( d_ctrs, 0, num_portions * num_passes * sizeof(AtomicOffsetT), stream))) break; // compute num_passes histograms with RADIX_DIGITS bins each if (CubDebug(error = cudaMemsetAsync (d_bins, 0, num_passes * RADIX_DIGITS * sizeof(OffsetT), stream))) break; int device = -1; int num_sms = 0; if (CubDebug(error = cudaGetDevice(&device))) break; if (CubDebug(error = cudaDeviceGetAttribute( &num_sms, cudaDevAttrMultiProcessorCount, device))) break; const int HISTO_BLOCK_THREADS = ActivePolicyT::HistogramPolicy::BLOCK_THREADS; int histo_blocks_per_sm = 1; auto histogram_kernel = DeviceRadixSortHistogramKernel< MaxPolicyT, IS_DESCENDING, KeyT, OffsetT>; if (CubDebug(error = cudaOccupancyMaxActiveBlocksPerMultiprocessor( &histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0))) break; // log histogram_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, " "%d SM occupancy, bit_grain %d\n", histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, reinterpret_cast(stream), ActivePolicyT::HistogramPolicy::ITEMS_PER_THREAD, histo_blocks_per_sm, ActivePolicyT::HistogramPolicy::RADIX_BITS); #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream ).doit(histogram_kernel, d_bins, d_keys.Current(), num_items, begin_bit, end_bit); if (CubDebug(error)) { break; } error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // exclusive sums to determine starts const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; // log exclusive_sum_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n", num_passes, SCAN_BLOCK_THREADS, reinterpret_cast(stream), ActivePolicyT::ExclusiveSumPolicy::RADIX_BITS); #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_passes, SCAN_BLOCK_THREADS, 0, stream ).doit(DeviceRadixSortExclusiveSumKernel, d_bins); if (CubDebug(error)) { break; } error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // use the other buffer if no overwrite is allowed KeyT* d_keys_tmp = d_keys.Alternate(); ValueT* d_values_tmp = d_values.Alternate(); if (!is_overwrite_okay && num_passes % 2 == 0) { d_keys.d_buffers[1] = d_keys_tmp2; d_values.d_buffers[1] = d_values_tmp2; } for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { int num_bits = CUB_MIN(end_bit - current_bit, RADIX_BITS); for (OffsetT portion = 0; portion < num_portions; ++portion) { PortionOffsetT portion_num_items = static_cast( CUB_MIN(num_items - portion * PORTION_SIZE, static_cast(PORTION_SIZE))); PortionOffsetT num_blocks = 
cub::DivideAndRoundUp(portion_num_items, ONESWEEP_TILE_ITEMS); if (CubDebug(error = cudaMemsetAsync( d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), stream))) break; // log onesweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, " "current bit %d, bit_grain %d, portion %d/%d\n", num_blocks, ONESWEEP_BLOCK_THREADS, reinterpret_cast(stream), ActivePolicyT::OnesweepPolicy::ITEMS_PER_THREAD, current_bit, num_bits, static_cast(portion), static_cast(num_portions)); #endif auto onesweep_kernel = DeviceRadixSortOnesweepKernel< MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT, PortionOffsetT>; error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream ).doit(onesweep_kernel, d_lookback, d_ctrs + portion * num_passes + pass, portion < num_portions - 1 ? d_bins + ((portion + 1) * num_passes + pass) * RADIX_DIGITS : NULL, d_bins + (portion * num_passes + pass) * RADIX_DIGITS, d_keys.Alternate(), d_keys.Current() + portion * PORTION_SIZE, d_values.Alternate(), d_values.Current() + portion * PORTION_SIZE, portion_num_items, current_bit, num_bits); if (CubDebug(error)) { break; } error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } if (error != cudaSuccess) { break; } // use the temporary buffers if no overwrite is allowed if (!is_overwrite_okay && pass == 0) { d_keys = num_passes % 2 == 0 ? DoubleBuffer(d_keys_tmp, d_keys_tmp2) : DoubleBuffer(d_keys_tmp2, d_keys_tmp); d_values = num_passes % 2 == 0 ? DoubleBuffer(d_values_tmp, d_values_tmp2) : DoubleBuffer(d_values_tmp2, d_values_tmp); } d_keys.selector ^= 1; d_values.selector ^= 1; } } while (0); return error; } /// Invocation (run multiple digit passes) template < typename ActivePolicyT, ///< Umbrella policy active for the target device typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel typename ScanKernelT, ///< Function type of cub::SpineScanKernel typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses( UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel DownsweepKernelT alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel { cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; // Get SM count int sm_count; if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; // Init regular and alternate-digit kernel configurations PassConfig pass_config, alt_pass_config; error = pass_config.template InitPassConfig< typename ActivePolicyT::UpsweepPolicy, typename ActivePolicyT::ScanPolicy, typename ActivePolicyT::DownsweepPolicy>(upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items); if (error) { break; } error = alt_pass_config.template InitPassConfig< typename ActivePolicyT::AltUpsweepPolicy, 
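// The ping-pong between passes above relies on cub::DoubleBuffer, which is
// just a pair of device pointers plus a selector. A minimal usage sketch
// (buffer allocation elided):
#if 0 // illustrative sketch only, not compiled
int *d_buf_a, *d_buf_b;                  // two equally sized device allocations
cub::DoubleBuffer<int> d_keys(d_buf_a, d_buf_b);
int *in  = d_keys.Current();             // buffer holding valid data
int *out = d_keys.Alternate();           // scratch buffer for the next pass
d_keys.selector ^= 1;                    // after a pass, the roles swap
// After the final pass, d_keys.Current() points at the sorted output, which
// is why the dispatcher adjusts d_keys.selector by the number of passes run.
#endif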
typename ActivePolicyT::ScanPolicy, typename ActivePolicyT::AltDownsweepPolicy>(alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items); if (error) { break; } // Get maximum spine length int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; // Temporary storage allocation requirements void* allocations[3] = {}; size_t allocation_sizes[3] = { spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) return cudaSuccess; // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size int num_bits = end_bit - begin_bit; int num_passes = cub::DivideAndRoundUp(num_bits, pass_config.radix_bits); bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); // Alias the temporary storage allocations OffsetT *d_spine = static_cast(allocations[0]); DoubleBuffer d_keys_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[1]), (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_keys.Alternate()); DoubleBuffer d_values_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[2]), (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[2]) : d_values.Alternate()); // Run first pass, consuming from the input's current buffers int current_bit = begin_bit; if (CubDebug(error = InvokePass( d_keys.Current(), d_keys_remaining_passes.Current(), d_values.Current(), d_values_remaining_passes.Current(), d_spine, spine_length, current_bit, (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; // Run remaining passes while (current_bit < end_bit) { if (CubDebug(error = InvokePass( d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], d_spine, spine_length, current_bit, (current_bit < alt_end_bit) ? 
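// Worked example of the pass planning above (digit widths are illustrative;
// the real values come from the active Downsweep/AltDownsweep policies).
// With begin_bit = 0, end_bit = 32, a 7-bit primary digit and a 6-bit
// alternate digit:
//   num_bits       = 32
//   num_passes     = ceil(32 / 7)       = 5
//   max_alt_passes = 5 * 7 - 32         = 3
//   alt_end_bit    = min(32, 0 + 3 * 6) = 18
// so bits [0, 18) are consumed by three 6-bit alternate passes and bits
// [18, 32) by two 7-bit passes -- five passes total, with no wasted bits.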
alt_pass_config : pass_config))) break;; // Invert selectors d_keys_remaining_passes.selector ^= 1; d_values_remaining_passes.selector ^= 1; } // Update selector if (!is_overwrite_okay) { num_passes = 1; // Sorted data always ends up in the other vector } d_keys.selector = (d_keys.selector + num_passes) & 1; d_values.selector = (d_values.selector + num_passes) & 1; } while (0); return error; } //------------------------------------------------------------------------------ // Chained policy invocation //------------------------------------------------------------------------------ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokeManyTiles(Int2Type) { // Invoke upsweep-downsweep typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; return InvokePasses( DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); } template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokeManyTiles(Int2Type) { // Invoke onesweep return InvokeOnesweep(); } CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokeCopy() { // is_overwrite_okay == false here // Return the number of temporary bytes if requested if (d_temp_storage == nullptr) { temp_storage_bytes = 1; return cudaSuccess; } // Copy keys #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long)num_items, (long long)stream); #endif cudaError_t error = cudaSuccess; error = cudaMemcpyAsync(d_keys.Alternate(), d_keys.Current(), num_items * sizeof(KeyT), cudaMemcpyDefault, stream); if (CubDebug(error)) { return error; } if (CubDebug(error = detail::DebugSyncStream(stream))) { return error; } d_keys.selector ^= 1; // Copy values if necessary if (!KEYS_ONLY) { #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking async copy of %lld values on stream %lld\n", (long long)num_items, (long long)stream); #endif error = cudaMemcpyAsync(d_values.Alternate(), d_values.Current(), num_items * sizeof(ValueT), cudaMemcpyDefault, stream); if (CubDebug(error)) { return error; } if (CubDebug(error = detail::DebugSyncStream(stream))) { return error; } } d_values.selector ^= 1; return error; } /// Invocation template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; // Return if empty problem, or if no bits to sort and double-buffering is used if (num_items == 0 || (begin_bit == end_bit && is_overwrite_okay)) { if (d_temp_storage == nullptr) { temp_storage_bytes = 1; } return cudaSuccess; } // Check if simple copy suffices (is_overwrite_okay == false at this point) if (begin_bit == end_bit) { bool has_uva = false; cudaError_t error = detail::HasUVA(has_uva); if (error != cudaSuccess) return error; if (has_uva) { return InvokeCopy(); } } // Force kernel code-generation in all compiler passes if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) { // Small, single tile size return InvokeSingleTile( DeviceRadixSortSingleTileKernel); } else { // Regular size return InvokeManyTiles(Int2Type()); } } //------------------------------------------------------------------------------ // Dispatch entrypoints 
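// InvokeManyTiles() above uses cub::Int2Type to lift the compile-time
// ONESWEEP policy constant into a tag type, so overload resolution (rather
// than a runtime branch) picks the onesweep or upsweep/downsweep path. A
// minimal sketch of the pattern with hypothetical names:
#if 0 // illustrative sketch only, not compiled
#include <cub/cub.cuh>

struct Sorter
{
  cudaError_t InvokeUpsweepDownsweep() { /* ... */ return cudaSuccess; }
  cudaError_t InvokeOnesweepPath()     { /* ... */ return cudaSuccess; }

  cudaError_t InvokeManyTiles(cub::Int2Type<false>) { return InvokeUpsweepDownsweep(); }
  cudaError_t InvokeManyTiles(cub::Int2Type<true>)  { return InvokeOnesweepPath(); }

  template <bool ONESWEEP>
  cudaError_t Invoke() { return InvokeManyTiles(cub::Int2Type<ONESWEEP>()); }
};
#endif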
//------------------------------------------------------------------------------ /** * Internal dispatch routine */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values OffsetT num_items, ///< [in] Number of items to sort int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers cudaStream_t stream) ///< [in] CUDA stream to launch kernels within. Default is stream0. { typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; cudaError_t error; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Create dispatch functor DispatchRadixSort dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream, ptx_version); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } }; /****************************************************************************** * Segmented dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort */ template < bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename ValueT, ///< Value type typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator typename OffsetT, ///< Signed integer type for global offsets typename SelectedPolicy = DeviceRadixSortPolicy > struct DispatchSegmentedRadixSort : SelectedPolicy { //------------------------------------------------------------------------------ // Constants //------------------------------------------------------------------------------ // Whether this is a keys-only (or key-value) sort constexpr static bool KEYS_ONLY = std::is_same::value; //------------------------------------------------------------------------------ // Parameter members //------------------------------------------------------------------------------ void *d_temp_storage; ///< [in] Device-accessible 
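// The Dispatch() entry point above backs the public cub::DeviceRadixSort
// interface, which follows CUB's usual two-phase pattern: a first call with
// d_temp_storage == nullptr only writes the required temp_storage_bytes, and
// a second call performs the sort. Usage sketch (the wrapper function is
// hypothetical and error checking is elided):
#if 0 // illustrative sketch only, not compiled
#include <cub/cub.cuh>

void SortExample(cub::DoubleBuffer<unsigned int> &d_keys, int num_items,
                 cudaStream_t stream)
{
  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // Phase 1: size query -- no sorting work is done.
  cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                 d_keys, num_items, 0, 32, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Phase 2: the actual sort; d_keys.Current() holds the sorted keys after.
  cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                 d_keys, num_items, 0, 32, stream);
  cudaFree(d_temp_storage);
}
#endif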
allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values OffsetT num_items; ///< [in] Number of items to sort OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data BeginOffsetIteratorT d_begin_offsets; ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* EndOffsetIteratorT d_end_offsets; ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. int ptx_version; ///< [in] PTX version bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers //------------------------------------------------------------------------------ // Constructors //------------------------------------------------------------------------------ /// Constructor CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedRadixSort( void* d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage), temp_storage_bytes(temp_storage_bytes), d_keys(d_keys), d_values(d_values), num_items(num_items), num_segments(num_segments), d_begin_offsets(d_begin_offsets), d_end_offsets(d_end_offsets), begin_bit(begin_bit), end_bit(end_bit), is_overwrite_okay(is_overwrite_okay), stream(stream), ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedRadixSort( void* d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage), temp_storage_bytes(temp_storage_bytes), d_keys(d_keys), d_values(d_values), num_items(num_items), num_segments(num_segments), d_begin_offsets(d_begin_offsets), d_end_offsets(d_end_offsets), begin_bit(begin_bit), end_bit(end_bit), is_overwrite_okay(is_overwrite_okay), stream(stream), ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } //------------------------------------------------------------------------------ // Multi-segment invocation 
//------------------------------------------------------------------------------ /// Invoke a three-kernel sorting pass at the current bit. template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePass( const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int ¤t_bit, PassConfigT &pass_config) { cudaError error = cudaSuccess; do { int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), " "%lld items per thread, %lld SM occupancy, " "current bit %d, bit_grain %d\n", (long long)num_segments, (long long)pass_config.segmented_config.block_threads, (long long)stream, (long long)pass_config.segmented_config.items_per_thread, (long long)pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_segments, pass_config.segmented_config.block_threads, 0, stream ).doit(pass_config.segmented_kernel, d_keys_in, d_keys_out, d_values_in, d_values_out, d_begin_offsets, d_end_offsets, num_segments, current_bit, pass_bits); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Update current bit current_bit += pass_bits; } while (0); return error; } /// PassConfig data structure template struct PassConfig { SegmentedKernelT segmented_kernel; KernelConfig segmented_config; int radix_bits; int radix_digits; /// Initialize pass configuration template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel) { this->segmented_kernel = segmented_kernel; this->radix_bits = SegmentedPolicyT::RADIX_BITS; this->radix_digits = 1 << radix_bits; return CubDebug(segmented_config.Init(segmented_kernel)); } }; /// Invocation (run multiple digit passes) template < typename ActivePolicyT, ///< Umbrella policy active for the target device typename SegmentedKernelT> ///< Function type of cub::DeviceSegmentedRadixSortKernel CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses( SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel { cudaError error = cudaSuccess; do { // Init regular and alternate kernel configurations PassConfig pass_config, alt_pass_config; if ((error = pass_config.template InitPassConfig(segmented_kernel))) break; if ((error = alt_pass_config.template InitPassConfig(alt_segmented_kernel))) break; // Temporary storage allocation requirements void* allocations[2] = {}; size_t allocation_sizes[2] = { (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) { if (temp_storage_bytes == 0) temp_storage_bytes = 1; return cudaSuccess; } // Pass planning. 
Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; int num_bits = end_bit - begin_bit; int num_passes = CUB_MAX(DivideAndRoundUp(num_bits, radix_bits), 1); bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * radix_bits) - num_bits; int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); DoubleBuffer d_keys_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[0]) : d_keys.Alternate()); DoubleBuffer d_values_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[1]), (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_values.Alternate()); // Run first pass, consuming from the input's current buffers int current_bit = begin_bit; if (CubDebug(error = InvokePass( d_keys.Current(), d_keys_remaining_passes.Current(), d_values.Current(), d_values_remaining_passes.Current(), current_bit, (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; // Run remaining passes while (current_bit < end_bit) { if (CubDebug(error = InvokePass( d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], current_bit, (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; // Invert selectors and update current bit d_keys_remaining_passes.selector ^= 1; d_values_remaining_passes.selector ^= 1; } // Update selector if (!is_overwrite_okay) { num_passes = 1; // Sorted data always ends up in the other vector } d_keys.selector = (d_keys.selector + num_passes) & 1; d_values.selector = (d_values.selector + num_passes) & 1; } while (0); return error; } //------------------------------------------------------------------------------ // Chained policy invocation //------------------------------------------------------------------------------ /// Invocation template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; // Return if empty problem, or if no bits to sort and double-buffering is used if (num_items == 0 || (begin_bit == end_bit && is_overwrite_okay)) { if (d_temp_storage == nullptr) { temp_storage_bytes = 1; } return cudaSuccess; } // Force kernel code-generation in all compiler passes return InvokePasses( DeviceSegmentedRadixSortKernel, DeviceSegmentedRadixSortKernel); } //------------------------------------------------------------------------------ // Dispatch entrypoints //------------------------------------------------------------------------------ /// Internal dispatch routine CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
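// Segment boundaries are described purely by the begin/end offset iterators;
// the keys themselves stay in one contiguous allocation. A usage sketch of
// the public interface built on this dispatcher (the wrapper function and
// offsets are illustrative, error checking elided):
#if 0 // illustrative sketch only, not compiled
#include <cub/cub.cuh>

// Three segments over 7 keys: [0,3), [3,3) (empty), and [3,7).
// d_offsets = {0, 3, 3, 7}; begin offsets are d_offsets, end offsets d_offsets + 1.
void SegmentedSortExample(const int *d_keys_in, int *d_keys_out,
                          const int *d_offsets)
{
  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                          d_keys_in, d_keys_out,
                                          7 /*num_items*/, 3 /*num_segments*/,
                                          d_offsets, d_offsets + 1);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                          d_keys_in, d_keys_out,
                                          7, 3, d_offsets, d_offsets + 1);
  cudaFree(d_temp_storage);
}
#endif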
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values int num_items, ///< [in] Number of items to sort int num_segments, ///< [in] The number of segments that comprise the sorting data BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers cudaStream_t stream) ///< [in] CUDA stream to launch kernels within. Default is stream0. { typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; cudaError_t error; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Create dispatch functor DispatchSegmentedRadixSort dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, is_overwrite_okay, stream, ptx_version); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, is_overwrite_okay, stream); } }; CUB_NAMESPACE_END #if defined(__clang__) # pragma clang diagnostic pop #endif cub-2.0.1/cub/device/dispatch/dispatch_reduce.cuh000066400000000000000000001247271434614775400217550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceReduce provides device-wide, parallel operations for * computing a reduction across a sequence of data items residing within * device-accessible memory. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Reduce region kernel entry point (multi-block). Computes privatized * reductions, one per thread block. * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items \iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitT * Initial value type * * @tparam AccumT * Accumulator type * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input data items * * @param[in] even_share * Even-share descriptor for mapping an equal number of tiles onto each * thread block * * @param[in] reduction_op * Binary reduction functor */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceReduceKernel(InputIteratorT d_in, AccumT* d_out, OffsetT num_items, GridEvenShare even_share, ReductionOpT reduction_op) { // Thread block type for reducing input tiles using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; // Consume input tiles AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); // Output result if (threadIdx.x == 0) { detail::uninitialized_copy(d_out + blockIdx.x, block_aggregate); } } /** * @brief Reduce a single tile kernel entry point (single-block). Can be used * to aggregate privatized thread block reductions from a previous * multi-block reduction pass. 
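// The reduction is staged in at most two passes: DeviceReduceKernel above
// writes one privatized partial per thread block, and the single-tile kernel
// below folds those partials together with the user-supplied init value (an
// empty input therefore yields init). A host-side analogue of the scheme
// (names hypothetical):
#if 0 // illustrative sketch only, not compiled
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

template <class Op, class T>
T TwoPassReduce(const std::vector<T> &in, std::size_t block_size, Op op, T init)
{
  // Pass 1: one partial result per "block" of the input.
  std::vector<T> partials;
  for (std::size_t i = 0; i < in.size(); i += block_size)
  {
    std::size_t end = std::min(in.size(), i + block_size);
    partials.push_back(std::accumulate(in.begin() + i + 1, in.begin() + end, in[i], op));
  }
  // Pass 2: a single "block" reduces the partials and applies init.
  return std::accumulate(partials.begin(), partials.end(), init, op);
}
#endif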
* * @tparam ChainedPolicyT * Chained tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items \iterator * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate \iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` * * @tparam InitT * Initial value type * * @tparam AccumT * Accumulator type * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input data items * * @param[in] reduction_op * Binary reduction functor * * @param[in] init * The initial value of the reduction */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) __global__ void DeviceReduceSingleTileKernel(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init) { // Thread block type for reducing input tiles using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; // Check if empty problem if (num_items == 0) { if (threadIdx.x == 0) { *d_out = init; } return; } // Consume input tiles AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op) .ConsumeRange(OffsetT(0), num_items); // Output result if (threadIdx.x == 0) { *d_out = reduction_op(init, block_aggregate); } } /// Normalize input iterator to segment offset template __device__ __forceinline__ void NormalizeReductionOutput(T & /*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/) {} /// Normalize input iterator to segment offset (specialized for arg-index) template __device__ __forceinline__ void NormalizeReductionOutput( KeyValuePairT &val, OffsetT base_offset, ArgIndexInputIterator /*itr*/) { val.key -= base_offset; } /** * Segmented reduction (one block per segment) * @tparam ChainedPolicyT * Chained tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items \iterator * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate \iterator * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets * \iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets * \iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` * * @tparam InitT * Initial value type * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first element * of the *i*th data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. 
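// NormalizeReductionOutput above only matters for the arg-index reductions
// (ArgMin/ArgMax): the KeyValuePair produced by an ArgIndexInputIterator
// carries a *global* index, so the segment's base offset is subtracted to
// report a segment-relative position. For example, if a segment starts at
// global offset 100 and its smallest value sits at global index 103, the raw
// reduction returns key 103 and the normalization rewrites it to 3.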
* * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] reduction_op * Binary reduction functor * * @param[in] init * The initial value of the reduction */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceSegmentedReduceKernel( InputIteratorT d_in, OutputIteratorT d_out, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int /*num_segments*/, ReductionOpT reduction_op, InitT init) { // Thread block type for reducing input tiles using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; OffsetT segment_begin = d_begin_offsets[blockIdx.x]; OffsetT segment_end = d_end_offsets[blockIdx.x]; // Check if empty problem if (segment_begin == segment_end) { if (threadIdx.x == 0) { d_out[blockIdx.x] = init; } return; } // Consume input tiles AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op) .ConsumeRange(segment_begin, segment_end); // Normalize as needed NormalizeReductionOutput(block_aggregate, segment_begin, d_in); if (threadIdx.x == 0) { d_out[blockIdx.x] = reduction_op(init, block_aggregate); } } /****************************************************************************** * Policy ******************************************************************************/ /** * @tparam AccumT * Accumulator data type * * OffsetT * Signed integer type for global offsets * * ReductionOpT * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` */ template < typename AccumT, typename OffsetT, typename ReductionOpT> struct DeviceReducePolicy { //--------------------------------------------------------------------------- // Architecture-specific tuning policies //--------------------------------------------------------------------------- /// SM30 struct Policy300 : ChainedPolicy<300, Policy300, Policy300> { static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 20; static constexpr int items_per_vec_load = 2; // ReducePolicy (GTX670: 154.0 @ 48M 4B items) using ReducePolicy = AgentReducePolicy; // SingleTilePolicy using SingleTilePolicy = ReducePolicy; // SegmentedReducePolicy using SegmentedReducePolicy = ReducePolicy; }; /// SM35 struct Policy350 : ChainedPolicy<350, Policy350, Policy300> { static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 20; static constexpr int items_per_vec_load = 4; // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B // items) using ReducePolicy = AgentReducePolicy; // SingleTilePolicy using SingleTilePolicy = ReducePolicy; // SegmentedReducePolicy using SegmentedReducePolicy = ReducePolicy; }; /// SM60 struct Policy600 : ChainedPolicy<600, Policy600, Policy350> { static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 16; static constexpr int items_per_vec_load = 4; // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) using ReducePolicy = AgentReducePolicy; // SingleTilePolicy using SingleTilePolicy = ReducePolicy; // SegmentedReducePolicy using SegmentedReducePolicy = ReducePolicy; }; using MaxPolicy = Policy600; }; /****************************************************************************** * Single-problem dispatch *****************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for * device-wide reduction * * 
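// For a sense of scale: the Policy600 tuning above runs 256 threads per
// block with 16 items per thread, so each reduction tile covers
// 256 * 16 = 4096 input items, and vectorized loads move 4 items at a time.
// The single-tile path in DispatchReduce::Invoke() below kicks in exactly
// when the whole input fits in one such tile.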
@tparam InputIteratorT * Random-access input iterator type for reading input items \iterator * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate \iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitT * Initial value type */ template < typename InputIteratorT, typename OutputIteratorT, typename OffsetT, typename ReductionOpT, typename InitT = cub::detail::non_void_value_t< OutputIteratorT, cub::detail::value_t>, typename AccumT = detail::accumulator_t< ReductionOpT, InitT, cub::detail::value_t>, typename SelectedPolicy = DeviceReducePolicy> struct DispatchReduce : SelectedPolicy { //--------------------------------------------------------------------------- // Problem state //--------------------------------------------------------------------------- /// Device-accessible allocation of temporary storage. When `nullptr`, the /// required allocation size is written to `temp_storage_bytes` and no work /// is done. void *d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t &temp_storage_bytes; /// Pointer to the input sequence of data items InputIteratorT d_in; /// Pointer to the output aggregate OutputIteratorT d_out; /// Total number of input items (i.e., length of `d_in`) OffsetT num_items; /// Binary reduction functor ReductionOpT reduction_op; /// The initial value of the reduction InitT init; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; int ptx_version; //--------------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------------- /// Constructor CUB_RUNTIME_FUNCTION __forceinline__ DispatchReduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , num_items(num_items) , reduction_op(reduction_op) , init(init) , stream(stream) , ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ DispatchReduce(void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , num_items(num_items) , reduction_op(reduction_op) , init(init) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } //--------------------------------------------------------------------------- // Small-problem (single tile) invocation //--------------------------------------------------------------------------- /** * @brief Invoke a single block block to reduce in-core * * @tparam ActivePolicyT * Umbrella policy active for the target device * * @tparam SingleTileKernelT * Function type of cub::DeviceReduceSingleTileKernel * * @param[in] single_tile_kernel * Kernel function pointer to parameterization of * cub::DeviceReduceSingleTileKernel */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokeSingleTile(SingleTileKernelT single_tile_kernel) { cudaError error = cudaSuccess; do { // Return if the caller is 
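// The AccumT default above asks the binary operator itself what it returns
// when combining an InitT with an input value; intermediate results are then
// carried in that type rather than in the output iterator's value type. A
// rough host-side equivalent of the deduction (accum_t is a hypothetical
// stand-in for detail::accumulator_t):
#if 0 // illustrative sketch only, not compiled
#include <functional>
#include <type_traits>

template <class Op, class InitT, class InputT>
using accum_t = std::decay_t<std::invoke_result_t<Op, InitT, InputT>>;

// e.g. summing chars with an int initial value accumulates in int,
// avoiding overflow of the narrow input type:
static_assert(std::is_same_v<accum_t<std::plus<>, int, char>, int>, "");
#endif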
simply requesting the size of the storage // allocation if (d_temp_storage == NULL) { temp_storage_bytes = 1; break; } // Log single_reduce_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long)stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); #endif // Invoke single_reduce_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream) .doit(single_tile_kernel, d_in, d_out, num_items, reduction_op, init); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } while (0); return error; } //--------------------------------------------------------------------------- // Normal problem size invocation (two-pass) //--------------------------------------------------------------------------- /** * @brief Invoke two-passes to reduce * @tparam ActivePolicyT * Umbrella policy active for the target device * * @tparam ReduceKernelT * Function type of cub::DeviceReduceKernel * * @tparam SingleTileKernelT * Function type of cub::DeviceReduceSingleTileKernel * * @param[in] reduce_kernel * Kernel function pointer to parameterization of cub::DeviceReduceKernel * * @param[in] single_tile_kernel * Kernel function pointer to parameterization of * cub::DeviceReduceSingleTileKernel */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses(ReduceKernelT reduce_kernel, SingleTileKernelT single_tile_kernel) { cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; // Get SM count int sm_count; if (CubDebug( error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) { break; } // Init regular kernel configuration KernelConfig reduce_config; if (CubDebug( error = reduce_config.Init( reduce_kernel))) { break; } int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; // Even-share work distribution int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); GridEvenShare even_share; even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); // Temporary storage allocation requirements void *allocations[1] = {}; size_t allocation_sizes[1] = { max_blocks * sizeof(AccumT) // bytes needed for privatized block // reductions }; // Alias the temporary allocations from the single storage blob (or // compute the necessary size of the blob) if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage // allocation return cudaSuccess; } // Alias the allocation for the privatized per-block reductions AccumT *d_block_reductions = (AccumT *)allocations[0]; // Get grid size for device_reduce_sweep_kernel int reduce_grid_size = even_share.grid_size; // Log device_reduce_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, (long long)stream, ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, reduce_config.sm_occupancy); #endif // Invoke 
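// Worked example of the even-share sizing above (all numbers illustrative):
// on a device with 80 SMs, an occupancy of 4 blocks per SM, and an
// oversubscription factor of 5, max_blocks = 80 * 4 * 5 = 1600, so the only
// temporary storage the two-pass reduction needs is 1600 * sizeof(AccumT)
// bytes of privatized per-block partials.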
DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream) .doit(reduce_kernel, d_in, d_block_reductions, num_items, even_share, reduction_op); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Log single_reduce_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long)stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); #endif // Invoke DeviceReduceSingleTileKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream) .doit(single_tile_kernel, d_block_reductions, d_out, reduce_grid_size, reduction_op, init); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } while (0); return error; } //--------------------------------------------------------------------------- // Chained policy invocation //--------------------------------------------------------------------------- /// Invocation template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; typedef typename DispatchReduce::MaxPolicy MaxPolicyT; // Force kernel code-generation in all compiler passes if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) { // Small, single tile size return InvokeSingleTile( DeviceReduceSingleTileKernel); } else { // Regular size return InvokePasses( DeviceReduceKernel, DeviceReduceSingleTileKernel); } } //--------------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------------- /** * @brief Internal dispatch routine for computing a device-wide reduction * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] reduction_op * Binary reduction functor * * @param[in] init * The initial value of the reduction * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
*/ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init, cudaStream_t stream) { typedef typename DispatchReduce::MaxPolicy MaxPolicyT; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Create dispatch functor DispatchReduce dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream, ptx_version); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } }; /****************************************************************************** * Segmented dispatch *****************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for * device-wide reduction * * @tparam InputIteratorT * Random-access input iterator type for reading input items \iterator * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate \iterator * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets * \iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets * \iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitT * value type */ template < typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename OffsetT, typename ReductionOpT, typename InitT = cub::detail::non_void_value_t< OutputIteratorT, cub::detail::value_t>, typename AccumT = detail::accumulator_t< ReductionOpT, InitT, cub::detail::value_t>, typename SelectedPolicy = DeviceReducePolicy> struct DispatchSegmentedReduce : SelectedPolicy { //--------------------------------------------------------------------------- // Problem state //--------------------------------------------------------------------------- /// Device-accessible allocation of temporary storage. When `nullptr`, the /// required allocation size is written to `temp_storage_bytes` and no work /// is done. 
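// The Dispatch() above is reached through cub::DeviceReduce::Reduce, which
// takes an explicit binary operator and initial value (the init is folded in
// by the single-tile kernel, so an empty input simply yields init). Usage
// sketch with a custom operator (MinOp/MinExample are hypothetical names,
// error checking elided):
#if 0 // illustrative sketch only, not compiled
#include <cub/cub.cuh>
#include <climits>

struct MinOp
{
  __host__ __device__ int operator()(int a, int b) const { return b < a ? b : a; }
};

void MinExample(const int *d_in, int *d_out, int num_items)
{
  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                            d_in, d_out, num_items, MinOp{}, INT_MAX);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                            d_in, d_out, num_items, MinOp{}, INT_MAX);
  cudaFree(d_temp_storage);
}
#endif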
void *d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t &temp_storage_bytes; /// Pointer to the input sequence of data items InputIteratorT d_in; /// Pointer to the output aggregate OutputIteratorT d_out; /// The number of segments that comprise the sorting data OffsetT num_segments; /// Random-access input iterator to the sequence of beginning offsets of /// length `num_segments`, such that `d_begin_offsets[i]` is the first /// element of the *i*th data segment in `d_keys_*` and /// `d_values_*` BeginOffsetIteratorT d_begin_offsets; /// Random-access input iterator to the sequence of ending offsets of length /// `num_segments`, such that `d_end_offsets[i] - 1` is the last element of /// the *i*th data segment in `d_keys_*` and `d_values_*`. /// If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is /// considered empty. EndOffsetIteratorT d_end_offsets; /// Binary reduction functor ReductionOpT reduction_op; /// The initial value of the reduction InitT init; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; int ptx_version; //--------------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------------- /// Constructor CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedReduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , reduction_op(reduction_op) , init(init) , stream(stream) , ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedReduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , reduction_op(reduction_op) , init(init) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } //--------------------------------------------------------------------------- // Chained policy invocation //--------------------------------------------------------------------------- /** * @brief Invocation * * @tparam ActivePolicyT * Umbrella policy active for the target device * * @tparam DeviceSegmentedReduceKernelT * Function type of cub::DeviceSegmentedReduceKernel * * @param[in] segmented_reduce_kernel * Kernel function pointer to parameterization of * cub::DeviceSegmentedReduceKernel */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses(DeviceSegmentedReduceKernelT segmented_reduce_kernel) { cudaError error = cudaSuccess; do { // Return if the caller is simply requesting the size of the storage // allocation if (d_temp_storage == NULL) { temp_storage_bytes = 1; return cudaSuccess; } // Init kernel configuration KernelConfig segmented_reduce_config; if (CubDebug( error = 
segmented_reduce_config .Init( segmented_reduce_kernel))) { break; } // Log device_reduce_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread, %d SM occupancy\n", num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, (long long)stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); #endif // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream) .doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, num_segments, reduction_op, init); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } while (0); return error; } /// Invocation template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; // Force kernel code-generation in all compiler passes return InvokePasses( DeviceSegmentedReduceKernel); } //--------------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------------- /** * @brief Internal dispatch routine for computing a device-wide reduction * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] reduction_op * Binary reduction functor * * @param[in] init * The initial value of the reduction * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
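// Each segment is reduced by exactly one thread block (the launch above uses
// num_segments as its grid size), and an empty segment simply receives init.
// For example, with d_in = {8, 6, 7, 5, 3, 0, 9}, offsets {0, 3, 3, 7}, and a
// sum reduction with init = 0, the three segments [0,3), [3,3), and [3,7)
// produce d_out = {21, 0, 17}.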
*/ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream) { typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; if (num_segments <= 0) { return cudaSuccess; } cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Create dispatch functor DispatchSegmentedReduce dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, reduction_op, init, stream, ptx_version); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, reduction_op, init, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_reduce_by_key.cuh000066400000000000000000000623341434614775400233120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceReduceByKey provides device-wide, parallel operations for * reducing segments of values residing within device-accessible memory. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Multi-block reduce-by-key sweep kernel entry point * * @tparam AgentReduceByKeyPolicyT * Parameterized AgentReduceByKeyPolicyT tuning policy type * * @tparam KeysInputIteratorT * Random-access input iterator type for keys * * @tparam UniqueOutputIteratorT * Random-access output iterator type for keys * * @tparam ValuesInputIteratorT * Random-access input iterator type for values * * @tparam AggregatesOutputIteratorT * Random-access output iterator type for values * * @tparam NumRunsOutputIteratorT * Output iterator type for recording number of segments encountered * * @tparam ScanTileStateT * Tile status interface type * * @tparam EqualityOpT * KeyT equality operator type * * @tparam ReductionOpT * ValueT reduction operator type * * @tparam OffsetT * Signed integer type for global offsets * * @param d_keys_in * Pointer to the input sequence of keys * * @param d_unique_out * Pointer to the output sequence of unique keys (one key per run) * * @param d_values_in * Pointer to the input sequence of corresponding values * * @param d_aggregates_out * Pointer to the output sequence of value aggregates (one aggregate per run) * * @param d_num_runs_out * Pointer to total number of runs encountered * (i.e., the length of d_unique_out) * * @param tile_state * Tile status interface * * @param start_tile * The starting tile for the current grid * * @param equality_op * KeyT equality operator * * @param reduction_op * ValueT reduction operator * * @param num_items * Total number of items to select from */ template __launch_bounds__(int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) __global__ void DeviceReduceByKeyKernel(KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ScanTileStateT tile_state, int start_tile, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items) { // Thread block type for reducing tiles of value segments using AgentReduceByKeyT = AgentReduceByKey; // Shared memory for AgentReduceByKey __shared__ typename AgentReduceByKeyT::TempStorage temp_storage; // Process tiles AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op) .ConsumeRange(num_items, tile_state, start_tile); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for * DeviceReduceByKey * * @tparam KeysInputIteratorT * Random-access input iterator type for keys * * @tparam UniqueOutputIteratorT * Random-access output iterator type for keys * * @tparam ValuesInputIteratorT * Random-access input iterator type for values * * @tparam AggregatesOutputIteratorT * Random-access output iterator type for values * * @tparam NumRunsOutputIteratorT * Output iterator type for recording number of segments encountered * * @tparam EqualityOpT * KeyT equality operator type * * @tparam ReductionOpT * ValueT reduction operator type * * @tparam OffsetT * Signed integer type for global offsets * */ template , 
cub::detail::value_t>> struct DispatchReduceByKey { //------------------------------------------------------------------------- // Types and constants //------------------------------------------------------------------------- // The input keys type using KeyInputT = cub::detail::value_t; // The output keys type using KeyOutputT = cub::detail::non_void_value_t; // The input values type using ValueInputT = cub::detail::value_t; static constexpr int INIT_KERNEL_THREADS = 128; static constexpr int MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyOutputT), sizeof(AccumT)); static constexpr int COMBINED_INPUT_BYTES = sizeof(KeyOutputT) + sizeof(AccumT); // Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; //------------------------------------------------------------------------- // Tuning policies //------------------------------------------------------------------------- /// SM35 struct Policy350 { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 6; static constexpr int ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)); using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>; }; /****************************************************************************** * Tuning policies of current PTX compiler pass ******************************************************************************/ using PtxPolicy = Policy350; // "Opaque" policies (whose parameterizations aren't reflected in the type // signature) struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {}; /****************************************************************************** * Utilities ******************************************************************************/ /** * Initialize kernel dispatch configurations with the policies corresponding * to the PTX assembly we will use */ template CUB_RUNTIME_FUNCTION __forceinline__ static void InitConfigs(int /*ptx_version*/, KernelConfig &reduce_by_key_config) { NV_IF_TARGET(NV_IS_DEVICE, ( // We're on the device, so initialize the kernel dispatch // configurations with the current PTX policy reduce_by_key_config.template Init();), ( // We're on the host, so lookup and initialize the kernel // dispatch configurations with the policies that match the // device's PTX version // (There's only one policy right now) reduce_by_key_config .template Init();)); } /** * Kernel kernel dispatch configuration. */ struct KernelConfig { int block_threads; int items_per_thread; int tile_items; template CUB_RUNTIME_FUNCTION __forceinline__ void Init() { block_threads = PolicyT::BLOCK_THREADS; items_per_thread = PolicyT::ITEMS_PER_THREAD; tile_items = block_threads * items_per_thread; } }; //--------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------- /** * @brief Internal dispatch routine for computing a device-wide * reduce-by-key using the specified kernel functions. * * @tparam ScanInitKernelT * Function type of cub::DeviceScanInitKernel * * @tparam ReduceByKeyKernelT * Function type of cub::DeviceReduceByKeyKernelT * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys * * @param[out] d_unique_out * Pointer to the output sequence of unique keys (one key per run) * * @param[in] d_values_in * Pointer to the input sequence of corresponding values * * @param[out] d_aggregates_out * Pointer to the output sequence of value aggregates * (one aggregate per run) * * @param[out] d_num_runs_out * Pointer to total number of runs encountered * (i.e., the length of d_unique_out) * * @param[in] equality_op * KeyT equality operator * * @param[in] reduction_op * ValueT reduction operator * * @param[in] num_items * Total number of items to select from * * @param[in] stream * CUDA stream to launch kernels within. Default is stream0. * * @param[in] ptx_version * PTX version of dispatch kernels * * @param[in] init_kernel * Kernel function pointer to parameterization of * cub::DeviceScanInitKernel * * @param[in] reduce_by_key_kernel * Kernel function pointer to parameterization of * cub::DeviceReduceByKeyKernel * * @param[in] reduce_by_key_config * Dispatch parameters that match the policy that * `reduce_by_key_kernel` was compiled for */ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, int /*ptx_version*/, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config) { cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) { break; } // Number of input tiles int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) { break; // bytes needed for tile status descriptors } // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[1] = {}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage // allocation break; } // Construct the tile status interface ScanTileStateT tile_state; if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) { break; } // Log init_kernel configuration int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long)stream); #endif // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, num_tiles, d_num_runs_out); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { 
break; } // Return if empty problem if (num_items == 0) { break; } // Get SM occupancy for reduce_by_key_kernel int reduce_by_key_sm_occupancy; if (CubDebug(error = MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, reduce_by_key_config.block_threads))) { break; } // Get max x-dimension of grid int max_dim_x; if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) { break; } // Run grids in epochs (in case number of tiles exceeds max x-dimension int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log reduce_by_key_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d " "items per thread, %d SM occupancy\n", start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long)stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy); #endif // Invoke reduce_by_key_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( scan_grid_size, reduce_by_key_config.block_threads, 0, stream) .doit(reduce_by_key_kernel, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, tile_state, start_tile, equality_op, reduction_op, num_items); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } } while (0); return error; } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int ptx_version, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op, num_items, stream, ptx_version, init_kernel, reduce_by_key_kernel, reduce_by_key_config); } /** * Internal dispatch routine * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys * * @param[out] d_unique_out * Pointer to the output sequence of unique keys (one key per run) * * @param[in] d_values_in * Pointer to the input sequence of corresponding values * * @param[out] d_aggregates_out * Pointer to the output sequence of value aggregates * (one aggregate per run) * * @param[out] d_num_runs_out * Pointer to total number of runs encountered * (i.e., the length of d_unique_out) * * @param[in] equality_op * KeyT equality operator * * @param[in] reduction_op * ValueT reduction operator * * @param[in] num_items * Total number of items to select from * * @param[in] stream * CUDA stream to launch kernels within. Default is stream0. 
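 *
 * @par
 * For reference, the reduce-by-key semantics this routine implements: every
 * run of consecutive, equal keys produces one unique key and one value
 * aggregate. Illustrative values only (assuming `cub::Equality` and
 * `cub::Sum` as the functors):
 *
 * @code
 * // d_keys_in:        [0, 0, 1, 1, 1, 2]
 * // d_values_in:      [1, 2, 3, 4, 5, 6]
 * //
 * // After the work-performing call (non-null d_temp_storage):
 * // d_unique_out:     [0, 1, 2]
 * // d_aggregates_out: [3, 12, 6]
 * // d_num_runs_out:   [3]
 * @endcode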
*/ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream) { cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Get kernel kernel dispatch configurations KernelConfig reduce_by_key_config; InitConfigs(ptx_version, reduce_by_key_config); // Dispatch if (CubDebug( error = Dispatch( d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op, num_items, stream, ptx_version, DeviceCompactInitKernel, DeviceReduceByKeyKernel, reduce_by_key_config))) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op, num_items, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_rle.cuh000066400000000000000000000526471434614775400212710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * Select kernel entry point (multi-block) * * Performs functor-based selection if SelectOp functor type != NullType * Otherwise performs flag-based selection if FlagIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) */ template < typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator typename ScanTileStateT, ///< Tile status interface type typename EqualityOpT, ///< T equality operator type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) __global__ void DeviceRleSweepKernel( InputIteratorT d_in, ///< [in] Pointer to input sequence of data items OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) ScanTileStateT tile_status, ///< [in] Tile status interface EqualityOpT equality_op, ///< [in] Equality operator for input items OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) int num_tiles) ///< [in] Total number of tiles for the entire problem { // Thread block type for selecting data from input tiles typedef AgentRle< AgentRlePolicyT, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, EqualityOpT, OffsetT> AgentRleT; // Shared memory for AgentRle __shared__ typename AgentRleT::TempStorage temp_storage; // Process tiles AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange( num_tiles, tile_status, d_num_runs_out); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for DeviceRle */ template < typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator typename EqualityOpT, ///< T equality operator type typename OffsetT> ///< Signed integer type for global offsets struct DeviceRleDispatch { 
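    // For reference, the run-length-encode semantics this dispatcher drives,
    // with illustrative values only (as consumed by
    // cub::DeviceRunLengthEncode::NonTrivialRuns, only runs of length > 1
    // are recorded):
    //
    //   d_in:            [1, 1, 2, 3, 3, 3, 4]
    //   d_offsets_out:   [0, 3]
    //   d_lengths_out:   [2, 3]
    //   d_num_runs_out:  [2]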
/****************************************************************************** * Types and constants ******************************************************************************/ // The input value type using T = cub::detail::value_t; // The lengths output value type using LengthT = cub::detail::non_void_value_t; enum { INIT_KERNEL_THREADS = 128, }; // Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; /****************************************************************************** * Tuning policies ******************************************************************************/ /// SM35 struct Policy350 { enum { NOMINAL_4B_ITEMS_PER_THREAD = 15, ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), }; typedef AgentRlePolicy< 96, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_WARP_SCANS> RleSweepPolicy; }; /****************************************************************************** * Tuning policies of current PTX compiler pass ******************************************************************************/ typedef Policy350 PtxPolicy; // "Opaque" policies (whose parameterizations aren't reflected in the type signature) struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {}; /****************************************************************************** * Utilities ******************************************************************************/ /** * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use */ template CUB_RUNTIME_FUNCTION __forceinline__ static void InitConfigs( int /*ptx_version*/, KernelConfig& device_rle_config) { NV_IF_TARGET(NV_IS_DEVICE, ( // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy device_rle_config.template Init(); ), ( // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version // (There's only one policy right now) device_rle_config.template Init(); )); } /** * Kernel kernel dispatch configuration. Mirrors the constants within AgentRlePolicyT. */ struct KernelConfig { int block_threads; int items_per_thread; BlockLoadAlgorithm load_policy; bool store_warp_time_slicing; BlockScanAlgorithm scan_algorithm; template CUB_RUNTIME_FUNCTION __forceinline__ void Init() { block_threads = AgentRlePolicyT::BLOCK_THREADS; items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD; load_policy = AgentRlePolicyT::LOAD_ALGORITHM; store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING; scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM; } CUB_RUNTIME_FUNCTION __forceinline__ void Print() { printf("%d, %d, %d, %d, %d", block_threads, items_per_thread, load_policy, store_warp_time_slicing, scan_algorithm); } }; /****************************************************************************** * Dispatch entrypoints ******************************************************************************/ /** * Internal dispatch routine for computing a device-wide run-length-encode using the * specified kernel functions. */ template < typename DeviceScanInitKernelPtr, ///< Function type of cub::DeviceScanInitKernel typename DeviceRleSweepKernelPtr> ///< Function type of cub::DeviceRleSweepKernelPtr CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out) EqualityOpT equality_op, ///< [in] Equality operator for input items OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. int /*ptx_version*/, ///< [in] PTX version of dispatch kernels DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for { cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; // Number of input tiles int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) void* allocations[1] = {}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the tile status interface ScanTileStateT tile_status; if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; // Log device_scan_init_kernel configuration int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( init_grid_size, INIT_KERNEL_THREADS, 0, stream ).doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Return if empty problem if (num_items == 0) { break; } // Get SM occupancy for device_rle_sweep_kernel int device_rle_kernel_sm_occupancy; if (CubDebug(error = MaxSmOccupancy( device_rle_kernel_sm_occupancy, // out device_rle_sweep_kernel, device_rle_config.block_threads))) break; // Get max x-dimension of grid int max_dim_x; if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, 
cudaDevAttrMaxGridDimX, device_ordinal))) break;; // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log device_rle_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); #endif // Invoke device_rle_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( scan_grid_size, device_rle_config.block_threads, 0, stream ).doit(device_rle_sweep_kernel, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, tile_status, equality_op, num_items, num_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } while (0); return error; } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int ptx_version, DeviceScanInitKernelPtr device_scan_init_kernel, DeviceRleSweepKernelPtr device_rle_sweep_kernel, KernelConfig device_rle_config) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, equality_op, num_items, stream, ptx_version, device_scan_init_kernel, device_rle_sweep_kernel, device_rle_config); } /** * Internal dispatch routine */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation InputIteratorT d_in, ///< [in] Pointer to input sequence of data items OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) EqualityOpT equality_op, ///< [in] Equality operator for input items OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) cudaStream_t stream) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
{ cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Get kernel kernel dispatch configurations KernelConfig device_rle_config; InitConfigs(ptx_version, device_rle_config); // Dispatch if (CubDebug(error = Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, equality_op, num_items, stream, ptx_version, DeviceCompactInitKernel, DeviceRleSweepKernel, device_rle_config))) break; } while (0); return error; } CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, equality_op, num_items, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_scan.cuh000066400000000000000000000543631434614775400214300ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceScan provides device-wide, parallel operations for * computing a prefix scan across a sequence of data items residing * within device-accessible memory. 
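 *
 * For reference, with illustrative values: an exclusive sum over
 * `[8, 6, 7, 5]` seeded with an `init_value` of `0` produces
 * `[0, 8, 14, 21]`, while the inclusive variant (`InitValueT` =
 * `cub::NullType`) produces `[8, 14, 21, 26]`.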
*/ #pragma once #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Initialization kernel for tile status initialization (multi-block) * * @tparam ScanTileStateT * Tile status interface type * * @param[in] tile_state * Tile status interface * * @param[in] num_tiles * Number of tiles */ template __global__ void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles) { // Initialize tile status tile_state.InitializeStatus(num_tiles); } /** * Initialization kernel for tile status initialization (multi-block) * * @tparam ScanTileStateT * Tile status interface type * * @tparam NumSelectedIteratorT * Output iterator type for recording the number of items selected * * @param[in] tile_state * Tile status interface * * @param[in] num_tiles * Number of tiles * * @param[out] d_num_selected_out * Pointer to the total number of items selected * (i.e., length of `d_selected_out`) */ template __global__ void DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); // Initialize d_num_selected_out if ((blockIdx.x == 0) && (threadIdx.x == 0)) { *d_num_selected_out = 0; } } /** * @brief Scan kernel entry point (multi-block) * * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading scan inputs \iterator * * @tparam OutputIteratorT * Random-access output iterator type for writing scan outputs \iterator * * @tparam ScanTileStateT * Tile status interface type * * @tparam ScanOpT * Binary scan functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitValueT * Initial value to seed the exclusive scan * (cub::NullType for inclusive scans) * * @tparam OffsetT * Signed integer type for global offsets * * @paramInput d_in * data * * @paramOutput d_out * data * * @paramTile tile_state * status interface * * @paramThe start_tile * starting tile for the current grid * * @paramBinary scan_op * scan functor * * @paramInitial init_value * value to seed the exclusive scan * * @paramTotal num_items * number of scan items for the entire problem */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) __global__ void DeviceScanKernel(InputIteratorT d_in, OutputIteratorT d_out, ScanTileStateT tile_state, int start_tile, ScanOpT scan_op, InitValueT init_value, OffsetT num_items) { using RealInitValueT = typename InitValueT::value_type; typedef typename ChainedPolicyT::ActivePolicy::ScanPolicyT ScanPolicyT; // Thread block type for scanning input tiles typedef AgentScan AgentScanT; // Shared memory for AgentScan __shared__ typename AgentScanT::TempStorage temp_storage; RealInitValueT real_init_value = init_value; // Process tiles AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value) .ConsumeRange(num_items, tile_state, start_tile); } /****************************************************************************** * Policy ******************************************************************************/ template ///< Data type struct DeviceScanPolicy { // For large values, use timesliced loads/stores to fit shared memory. 
static constexpr bool LargeValues = sizeof(AccumT) > 128; static constexpr BlockLoadAlgorithm ScanTransposedLoad = LargeValues ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm ScanTransposedStore = LargeValues ? BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; /// SM350 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T typedef AgentScanPolicy<128, 12, ///< Threads per block, items per thread AccumT, BLOCK_LOAD_DIRECT, LOAD_CA, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, BLOCK_SCAN_RAKING> ScanPolicyT; }; /// SM520 struct Policy520 : ChainedPolicy<520, Policy520, Policy350> { // Titan X: 32.47B items/s @ 48M 32-bit T typedef AgentScanPolicy<128, 12, ///< Threads per block, items per thread AccumT, BLOCK_LOAD_DIRECT, LOAD_CA, ScanTransposedStore, BLOCK_SCAN_WARP_SCANS> ScanPolicyT; }; /// SM600 struct Policy600 : ChainedPolicy<600, Policy600, Policy520> { typedef AgentScanPolicy<128, 15, ///< Threads per block, items per thread AccumT, ScanTransposedLoad, LOAD_DEFAULT, ScanTransposedStore, BLOCK_SCAN_WARP_SCANS> ScanPolicyT; }; /// MaxPolicy typedef Policy600 MaxPolicy; }; /****************************************************************************** * Dispatch ******************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for * DeviceScan * * @tparam InputIteratorT * Random-access input iterator type for reading scan inputs \iterator * * @tparam OutputIteratorT * Random-access output iterator type for writing scan outputs \iterator * * @tparam ScanOpT * Binary scan functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitValueT * The init_value element type for ScanOpT (cub::NullType for inclusive scans) * * @tparam OffsetT * Signed integer type for global offsets * */ template ::value, cub::detail::value_t, typename InitValueT::value_type>, cub::detail::value_t>, typename SelectedPolicy = DeviceScanPolicy> struct DispatchScan : SelectedPolicy { //--------------------------------------------------------------------- // Constants and Types //--------------------------------------------------------------------- static constexpr int INIT_KERNEL_THREADS = 128; // The input value type using InputT = cub::detail::value_t; /// Device-accessible allocation of temporary storage. When NULL, the /// required allocation size is written to \p temp_storage_bytes and no work /// is done. void *d_temp_storage; /// Reference to size in bytes of \p d_temp_storage allocation size_t &temp_storage_bytes; /// Iterator to the input sequence of data items InputIteratorT d_in; /// Iterator to the output sequence of data items OutputIteratorT d_out; /// Binary scan functor ScanOpT scan_op; /// Initial value to seed the exclusive scan InitValueT init_value; /// Total number of input items (i.e., the length of \p d_in) OffsetT num_items; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; int ptx_version; /** * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Iterator to the input sequence of data items * * @param[out] d_out * Iterator to the output sequence of data items * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ CUB_RUNTIME_FUNCTION __forceinline__ DispatchScan(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ScanOpT scan_op, InitValueT init_value, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , scan_op(scan_op) , init_value(init_value) , num_items(num_items) , stream(stream) , ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ DispatchScan(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ScanOpT scan_op, InitValueT init_value, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , scan_op(scan_op) , init_value(init_value) , num_items(num_items) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } template CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { typedef typename ActivePolicyT::ScanPolicyT Policy; typedef typename cub::ScanTileState ScanTileStateT; // `LOAD_LDG` makes in-place execution UB and doesn't lead to better // performance. 
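    // (LOAD_LDG routes loads through the read-only/texture data path, which
    // assumes the addressed memory is not written for the lifetime of the
    // kernel. DeviceScan supports in-place operation (d_in == d_out), so such
    // loads could observe stale data; hence the static_assert below.)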
static_assert(Policy::LOAD_MODIFIER != CacheLoadModifier::LOAD_LDG, "The memory consistency model does not apply to texture " "accesses"); cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) { break; } // Number of input tiles int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) { break; // bytes needed for tile status descriptors } // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[1] = {}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage // allocation break; } // Return if empty problem if (num_items == 0) { break; } // Construct the tile status interface ScanTileStateT tile_state; if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) { break; } // Log init_kernel configuration int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long)stream); #endif // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, num_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Get SM occupancy for scan_kernel int scan_sm_occupancy; if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out scan_kernel, Policy::BLOCK_THREADS))) { break; } // Get max x-dimension of grid int max_dim_x; if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) { break; } // Run grids in epochs (in case number of tiles exceeds max x-dimension int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", start_tile, scan_grid_size, Policy::BLOCK_THREADS, (long long)stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); #endif // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( scan_grid_size, Policy::BLOCK_THREADS, 0, stream) .doit(scan_kernel, d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } } while (0); return error; } template CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke() { typedef typename DispatchScan::MaxPolicy MaxPolicyT; typedef typename cub::ScanTileState ScanTileStateT; // Ensure kernels are instantiated. 
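    // Naming the concrete kernel instantiations here (and passing them to the
    // kernel-pointer overload of Invoke above) forces their code generation
    // in all compiler passes, matching the approach taken by the other
    // dispatch layers in this directory.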
return Invoke(DeviceScanInitKernel, DeviceScanKernel); } /** * @brief Internal dispatch routine * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Iterator to the input sequence of data items * * @param[out] d_out * Iterator to the output sequence of data items * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream) { typedef typename DispatchScan::MaxPolicy MaxPolicyT; cudaError_t error; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Create dispatch functor DispatchScan dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, scan_op, init_value, stream, ptx_version); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_scan_by_key.cuh000066400000000000000000000561351434614775400227710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file DeviceScan provides device-wide, parallel operations for computing a * prefix scan across a sequence of data items residing within * device-accessible memory. */ #pragma once #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Scan kernel entry point (multi-block) * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam KeysInputIteratorT * Random-access input iterator type * * @tparam ValuesInputIteratorT * Random-access input iterator type * * @tparam ValuesOutputIteratorT * Random-access output iterator type * * @tparam ScanByKeyTileStateT * Tile status interface type * * @tparam EqualityOp * Equality functor type * * @tparam ScanOpT * Scan functor type * * @tparam InitValueT * The init_value element for ScanOpT type (cub::NullType for inclusive scan) * * @tparam OffsetT * Signed integer type for global offsets * * @param d_keys_in * Input keys data * * @param d_keys_prev_in * Predecessor items for each tile * * @param d_values_in * Input values data * * @param d_values_out * Output values data * * @param tile_state * Tile status interface * * @param start_tile * The starting tile for the current grid * * @param equality_op * Binary equality functor * * @param scan_op * Binary scan functor * * @param init_value * Initial value to seed the exclusive scan * * @param num_items * Total number of scan items for the entire problem */ template > __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) __global__ void DeviceScanByKeyKernel(KeysInputIteratorT d_keys_in, KeyT *d_keys_prev_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanByKeyTileStateT tile_state, int start_tile, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items) { using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT; // Thread block type for scanning input tiles using AgentScanByKeyT = AgentScanByKey; // Shared memory for AgentScanByKey __shared__ typename AgentScanByKeyT::TempStorage temp_storage; // Process tiles AgentScanByKeyT(temp_storage, d_keys_in, d_keys_prev_in, d_values_in, d_values_out, equality_op, scan_op, init_value) .ConsumeRange(num_items, tile_state, start_tile); } template __global__ void DeviceScanByKeyInitKernel( ScanTileStateT tile_state, KeysInputIteratorT d_keys_in, cub::detail::value_t *d_keys_prev_in, unsigned items_per_tile, int num_tiles) { // Initialize tile status tile_state.InitializeStatus(num_tiles); const unsigned tid = threadIdx.x + blockDim.x * blockIdx.x; const unsigned tile_base = tid * items_per_tile; if (tid > 0 && tid < num_tiles) { d_keys_prev_in[tid] = d_keys_in[tile_base - 1]; } } 
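// For reference, an illustrative trace of the predecessor-key capture in
// DeviceScanByKeyInitKernel above (hypothetical data, items_per_tile = 4):
//
//   d_keys_in:        [a, a, a, b, b, b, c, c]
//   tile 1 (tid = 1): d_keys_prev_in[1] = d_keys_in[3] = b
//
// Every tile except the first thus records the key immediately preceding its
// first item, which lets the scan decide whether that item continues the
// previous segment or starts a new one.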
/****************************************************************************** * Policy ******************************************************************************/ template struct DeviceScanByKeyPolicy { using KeyT = cub::detail::value_t; static constexpr size_t MaxInputBytes = (cub::max)(sizeof(KeyT), sizeof(AccumT)); static constexpr size_t CombinedInputBytes = sizeof(KeyT) + sizeof(AccumT); // SM350 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 6; static constexpr int ITEMS_PER_THREAD = ((MaxInputBytes <= 8) ? 6 : Nominal4BItemsToItemsCombined(NOMINAL_4B_ITEMS_PER_THREAD, CombinedInputBytes)); using ScanByKeyPolicyT = AgentScanByKeyPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_CA, BLOCK_SCAN_WARP_SCANS, BLOCK_STORE_WARP_TRANSPOSE>; }; // SM520 struct Policy520 : ChainedPolicy<520, Policy520, Policy350> { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 9; static constexpr int ITEMS_PER_THREAD = ((MaxInputBytes <= 8) ? 9 : Nominal4BItemsToItemsCombined(NOMINAL_4B_ITEMS_PER_THREAD, CombinedInputBytes)); using ScanByKeyPolicyT = AgentScanByKeyPolicy<256, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_CA, BLOCK_SCAN_WARP_SCANS, BLOCK_STORE_WARP_TRANSPOSE>; }; using MaxPolicy = Policy520; }; /****************************************************************************** * Dispatch ******************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels * for DeviceScan * * @tparam KeysInputIteratorT * Random-access input iterator type * * @tparam ValuesInputIteratorT * Random-access input iterator type * * @tparam ValuesOutputIteratorT * Random-access output iterator type * * @tparam EqualityOp * Equality functor type * * @tparam ScanOpT * Scan functor type * * @tparam InitValueT * The init_value element for ScanOpT type (cub::NullType for inclusive scan) * * @tparam OffsetT * Signed integer type for global offsets * */ template < typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename EqualityOp, typename ScanOpT, typename InitValueT, typename OffsetT, typename AccumT = detail::accumulator_t< ScanOpT, cub::detail::conditional_t< std::is_same::value, cub::detail::value_t, InitValueT>, cub::detail::value_t>, typename SelectedPolicy = DeviceScanByKeyPolicy> struct DispatchScanByKey : SelectedPolicy { //--------------------------------------------------------------------- // Constants and Types //--------------------------------------------------------------------- static constexpr int INIT_KERNEL_THREADS = 128; // The input key type using KeyT = cub::detail::value_t; // The input value type using InputT = cub::detail::value_t; /// Device-accessible allocation of temporary storage. When `nullptr`, the /// required allocation size is written to `temp_storage_bytes` and no work /// is done. 
void *d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t &temp_storage_bytes; /// Iterator to the input sequence of key items KeysInputIteratorT d_keys_in; /// Iterator to the input sequence of value items ValuesInputIteratorT d_values_in; /// Iterator to the input sequence of value items ValuesOutputIteratorT d_values_out; /// Binary equality functor EqualityOp equality_op; /// Binary scan functor ScanOpT scan_op; /// Initial value to seed the exclusive scan InitValueT init_value; /// Total number of input items (i.e., the length of `d_in`) OffsetT num_items; /// CUDA stream to launch kernels within. cudaStream_t stream; int ptx_version; /** * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Iterator to the input sequence of key items * * @param[in] d_values_in * Iterator to the input sequence of value items * * @param[out] d_values_out * Iterator to the input sequence of value items * * @param[in] equality_op * Binary equality functor * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * CUDA stream to launch kernels within. */ CUB_RUNTIME_FUNCTION __forceinline__ DispatchScanByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys_in(d_keys_in) , d_values_in(d_values_in) , d_values_out(d_values_out) , equality_op(equality_op) , scan_op(scan_op) , init_value(init_value) , num_items(num_items) , stream(stream) , ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ DispatchScanByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys_in(d_keys_in) , d_values_in(d_values_in) , d_values_out(d_values_out) , equality_op(equality_op) , scan_op(scan_op) , init_value(init_value) , num_items(num_items) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } template CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { using Policy = typename ActivePolicyT::ScanByKeyPolicyT; using ScanByKeyTileStateT = ReduceByKeyScanTileState; cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) { break; } // Number of input tiles int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[2]; if (CubDebug( error = ScanByKeyTileStateT::AllocationSize(num_tiles, 
allocation_sizes[0]))) { break; // bytes needed for tile status descriptors } allocation_sizes[1] = sizeof(KeyT) * (num_tiles + 1); // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[2] = {}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage // allocation break; } // Return if empty problem if (num_items == 0) { break; } KeyT *d_keys_prev_in = reinterpret_cast(allocations[1]); // Construct the tile status interface ScanByKeyTileStateT tile_state; if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) { break; } // Log init_kernel configuration int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long)stream); #endif // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, tile_size, num_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Get SM occupancy for scan_kernel int scan_sm_occupancy; if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out scan_kernel, Policy::BLOCK_THREADS))) { break; } // Get max x-dimension of grid int max_dim_x; if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) { break; } // Run grids in epochs (in case number of tiles exceeds max x-dimension int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", start_tile, scan_grid_size, Policy::BLOCK_THREADS, (long long)stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); #endif // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( scan_grid_size, Policy::BLOCK_THREADS, 0, stream) .doit(scan_kernel, d_keys_in, d_keys_prev_in, d_values_in, d_values_out, tile_state, start_tile, equality_op, scan_op, init_value, num_items); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } } while (0); return error; } template CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke() { using MaxPolicyT = typename DispatchScanByKey::MaxPolicy; using ScanByKeyTileStateT = ReduceByKeyScanTileState; // Ensure kernels are instantiated. return Invoke( DeviceScanByKeyInitKernel, DeviceScanByKeyKernel); } /** * @brief Internal dispatch routine * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Iterator to the input sequence of key items * * @param[in] d_values_in * Iterator to the input sequence of value items * * @param[out] d_values_out * Iterator to the input sequence of value items * * @param[in] equality_op * Binary equality functor * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * CUDA stream to launch kernels within. */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream) { using MaxPolicyT = typename DispatchScanByKey::MaxPolicy; cudaError_t error; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Create dispatch functor DispatchScanByKey dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, scan_op, init_value, num_items, stream, ptx_version); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, scan_op, init_value, num_items, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_segmented_sort.cuh000066400000000000000000002031321434614775400235140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief Fallback kernel, in case there's not enough segments to * take advantage of partitioning. * * In this case, the sorting method is still selected based on the segment size. * If a single warp can sort the segment, the algorithm will use the sub-warp * merge sort. Otherwise, the algorithm will use the in-shared-memory version of * block radix sort. If data don't fit into shared memory, the algorithm will * use in-global-memory radix sort. * * @param[in] d_keys_in_orig * Input keys buffer * * @param[out] d_keys_out_orig * Output keys buffer * * @param[in,out] d_keys_double_buffer * Double keys buffer * * @param[in] d_values_in_orig * Input values buffer * * @param[out] d_values_out_orig * Output values buffer * * @param[in,out] d_values_double_buffer * Double values buffer * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length * @p num_segments, such that `d_begin_offsets[i]` is the first element of the * i-th data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of the * i-th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. 
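 *
 * The snippet below is a minimal sketch of reaching this fallback path through
 * the public `cub::DeviceSegmentedSort::SortKeys` interface (the same entry
 * point also drives the partitioned path when there are enough segments). The
 * offsets and key values are illustrative only.
 *
 * @code
 * #include <cub/cub.cuh>
 * #include <cuda_runtime.h>
 *
 * int  num_items;     // e.g., 7
 * int  num_segments;  // e.g., 3
 * int *d_offsets;     // e.g., [0, 3, 3, 7]           (segment 1 is empty)
 * int *d_keys_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
 * int *d_keys_out;    // e.g., [-, -, -, -, -, -, -]
 *
 * void *d_temp_storage      = nullptr;
 * size_t temp_storage_bytes = 0;
 *
 * // First call: determine temporary storage requirements.
 * cub::DeviceSegmentedSort::SortKeys(d_temp_storage, temp_storage_bytes,
 *                                    d_keys_in, d_keys_out,
 *                                    num_items, num_segments,
 *                                    d_offsets, d_offsets + 1);
 *
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 * // Second call: perform the segmented sort.
 * cub::DeviceSegmentedSort::SortKeys(d_temp_storage, temp_storage_bytes,
 *                                    d_keys_in, d_keys_out,
 *                                    num_items, num_segments,
 *                                    d_offsets, d_offsets + 1);
 *
 * // d_keys_out is now [6, 7, 8, 0, 3, 5, 9]
 * @endcode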
*/ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) __global__ void DeviceSegmentedSortFallbackKernel( const KeyT *d_keys_in_orig, KeyT *d_keys_out_orig, cub::detail::device_double_buffer d_keys_double_buffer, const ValueT *d_values_in_orig, ValueT *d_values_out_orig, cub::detail::device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using MediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT::MediumPolicyT; const unsigned int segment_id = blockIdx.x; OffsetT segment_begin = d_begin_offsets[segment_id]; OffsetT segment_end = d_end_offsets[segment_id]; OffsetT num_items = segment_end - segment_begin; if (num_items <= 0) { return; } using AgentSegmentedRadixSortT = cub::AgentSegmentedRadixSort; using WarpReduceT = cub::WarpReduce; using AgentWarpMergeSortT = AgentSubWarpSort; __shared__ union { typename AgentSegmentedRadixSortT::TempStorage block_sort; typename WarpReduceT::TempStorage warp_reduce; typename AgentWarpMergeSortT::TempStorage medium_warp_sort; } temp_storage; constexpr bool keys_only = std::is_same::value; AgentSegmentedRadixSortT agent(num_items, temp_storage.block_sort); constexpr int begin_bit = 0; constexpr int end_bit = sizeof(KeyT) * 8; constexpr int cacheable_tile_size = LargeSegmentPolicyT::BLOCK_THREADS * LargeSegmentPolicyT::ITEMS_PER_THREAD; d_keys_in_orig += segment_begin; d_keys_out_orig += segment_begin; if (!keys_only) { d_values_in_orig += segment_begin; d_values_out_orig += segment_begin; } if (num_items <= MediumPolicyT::ITEMS_PER_TILE) { // Sort by a single warp if (threadIdx.x < MediumPolicyT::WARP_THREADS) { AgentWarpMergeSortT(temp_storage.medium_warp_sort) .ProcessSegment(num_items, d_keys_in_orig, d_keys_out_orig, d_values_in_orig, d_values_out_orig); } } else if (num_items < cacheable_tile_size) { // Sort by a CTA if data fits into shared memory agent.ProcessSinglePass(begin_bit, end_bit, d_keys_in_orig, d_values_in_orig, d_keys_out_orig, d_values_out_orig); } else { // Sort by a CTA with multiple reads from global memory int current_bit = begin_bit; int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); d_keys_double_buffer = cub::detail::device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); if (!keys_only) { d_values_double_buffer = cub::detail::device_double_buffer( d_values_double_buffer.current() + segment_begin, d_values_double_buffer.alternate() + segment_begin); } agent.ProcessIterative(current_bit, pass_bits, d_keys_in_orig, d_values_in_orig, d_keys_double_buffer.current(), d_values_double_buffer.current()); current_bit += pass_bits; #pragma unroll 1 while (current_bit < end_bit) { pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); CTA_SYNC(); agent.ProcessIterative(current_bit, pass_bits, d_keys_double_buffer.current(), d_values_double_buffer.current(), d_keys_double_buffer.alternate(), d_values_double_buffer.alternate()); d_keys_double_buffer.swap(); d_values_double_buffer.swap(); current_bit += pass_bits; } } } /** * @brief Single kernel for moderate size (less than a few thousand items) * segments. * * This kernel allocates a sub-warp per segment. Therefore, this kernel assigns * a single thread block to multiple segments. 
Segments fall into two * categories. An architectural warp usually sorts segments in the medium-size * category, while a few threads sort segments in the small-size category. Since * segments are partitioned, we know the last thread block index assigned to * sort medium-size segments. A particular thread block can check this number to * find out which category it was assigned to sort. In both cases, the * merge sort is used. * * @param[in] small_segments * Number of segments that can be sorted by a warp part * * @param[in] medium_segments * Number of segments that can be sorted by a warp * * @param[in] medium_blocks * Number of CTAs assigned to process medium segments * * @param[in] d_small_segments_indices * Small segments mapping of length @p small_segments, such that * `d_small_segments_indices[i]` is the input segment index * * @param[in] d_medium_segments_indices * Medium segments mapping of length @p medium_segments, such that * `d_medium_segments_indices[i]` is the input segment index * * @param[in] d_keys_in_orig * Input keys buffer * * @param[out] d_keys_out_orig * Output keys buffer * * @param[in] d_values_in_orig * Input values buffer * * @param[out] d_values_out_orig * Output values buffer * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length * @p num_segments, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of the * ith data segment in `d_keys_*` and `d_values_*`. If * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. */ template __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS) __global__ void DeviceSegmentedSortKernelSmall( unsigned int small_segments, unsigned int medium_segments, unsigned int medium_blocks, const unsigned int *d_small_segments_indices, const unsigned int *d_medium_segments_indices, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { const unsigned int tid = threadIdx.x; const unsigned int bid = blockIdx.x; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; using MediumPolicyT = typename SmallAndMediumPolicyT::MediumPolicyT; using SmallPolicyT = typename SmallAndMediumPolicyT::SmallPolicyT; constexpr int threads_per_medium_segment = MediumPolicyT::WARP_THREADS; constexpr int threads_per_small_segment = SmallPolicyT::WARP_THREADS; using MediumAgentWarpMergeSortT = AgentSubWarpSort; using SmallAgentWarpMergeSortT = AgentSubWarpSort; constexpr auto segments_per_medium_block = static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); constexpr auto segments_per_small_block = static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); __shared__ union { typename MediumAgentWarpMergeSortT::TempStorage medium_storage[segments_per_medium_block]; typename SmallAgentWarpMergeSortT::TempStorage small_storage[segments_per_small_block]; } temp_storage; if (bid < medium_blocks) { const unsigned int sid_within_block = tid / threads_per_medium_segment; const unsigned int medium_segment_id = bid * segments_per_medium_block + sid_within_block; if (medium_segment_id < medium_segments) { const unsigned 
int global_segment_id = d_medium_segments_indices[medium_segment_id]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; const OffsetT num_items = segment_end - segment_begin; MediumAgentWarpMergeSortT(temp_storage.medium_storage[sid_within_block]) .ProcessSegment(num_items, d_keys_in + segment_begin, d_keys_out + segment_begin, d_values_in + segment_begin, d_values_out + segment_begin); } } else { const unsigned int sid_within_block = tid / threads_per_small_segment; const unsigned int small_segment_id = (bid - medium_blocks) * segments_per_small_block + sid_within_block; if (small_segment_id < small_segments) { const unsigned int global_segment_id = d_small_segments_indices[small_segment_id]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; const OffsetT num_items = segment_end - segment_begin; SmallAgentWarpMergeSortT(temp_storage.small_storage[sid_within_block]) .ProcessSegment(num_items, d_keys_in + segment_begin, d_keys_out + segment_begin, d_values_in + segment_begin, d_values_out + segment_begin); } } } /** * @brief Single kernel for large size (more than a few thousand items) segments. * * @param[in] d_keys_in_orig * Input keys buffer * * @param[out] d_keys_out_orig * Output keys buffer * * @param[in] d_values_in_orig * Input values buffer * * @param[out] d_values_out_orig * Output values buffer * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length * @p num_segments, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that `d_end_offsets[i]-1` is the last element of the * ith data segment in `d_keys_*` and `d_values_*`. If * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. 
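 *
 * Keys in this path are sorted digit by digit, `RADIX_BITS` bits per pass. The
 * short host-side illustration below mirrors the arithmetic used in the kernel
 * body; the digit width is an assumed example, the real value comes from the
 * active tuning policy.
 *
 * @code
 * #include <algorithm>
 * #include <cstdint>
 *
 * constexpr int radix_bits = 6;                                       // assumed
 * constexpr int end_bit    = static_cast<int>(sizeof(std::uint32_t)) * 8; // 32
 *
 * int current_bit = 0;
 * int num_passes  = 0;
 * while (current_bit < end_bit)
 * {
 *   // The last pass may cover fewer than radix_bits bits.
 *   const int pass_bits = std::min(radix_bits, end_bit - current_bit);
 *   current_bit += pass_bits;
 *   ++num_passes; // 32-bit keys, 6-bit digits -> 6 passes (five full, one of 2 bits)
 * }
 * @endcode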
*/ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) __global__ void DeviceSegmentedSortKernelLarge( const unsigned int *d_segments_indices, const KeyT *d_keys_in_orig, KeyT *d_keys_out_orig, cub::detail::device_double_buffer d_keys_double_buffer, const ValueT *d_values_in_orig, ValueT *d_values_out_orig, cub::detail::device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; constexpr int small_tile_size = LargeSegmentPolicyT::BLOCK_THREADS * LargeSegmentPolicyT::ITEMS_PER_THREAD; using AgentSegmentedRadixSortT = cub::AgentSegmentedRadixSort; __shared__ typename AgentSegmentedRadixSortT::TempStorage storage; const unsigned int bid = blockIdx.x; constexpr int begin_bit = 0; constexpr int end_bit = sizeof(KeyT) * 8; const unsigned int global_segment_id = d_segments_indices[bid]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; const OffsetT num_items = segment_end - segment_begin; constexpr bool keys_only = std::is_same::value; AgentSegmentedRadixSortT agent(num_items, storage); d_keys_in_orig += segment_begin; d_keys_out_orig += segment_begin; if (!keys_only) { d_values_in_orig += segment_begin; d_values_out_orig += segment_begin; } if (num_items < small_tile_size) { // Sort in shared memory if the segment fits into it agent.ProcessSinglePass(begin_bit, end_bit, d_keys_in_orig, d_values_in_orig, d_keys_out_orig, d_values_out_orig); } else { // Sort reading global memory multiple times int current_bit = begin_bit; int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); d_keys_double_buffer = cub::detail::device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); if (!keys_only) { d_values_double_buffer = cub::detail::device_double_buffer( d_values_double_buffer.current() + segment_begin, d_values_double_buffer.alternate() + segment_begin); } agent.ProcessIterative(current_bit, pass_bits, d_keys_in_orig, d_values_in_orig, d_keys_double_buffer.current(), d_values_double_buffer.current()); current_bit += pass_bits; #pragma unroll 1 while (current_bit < end_bit) { pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); CTA_SYNC(); agent.ProcessIterative(current_bit, pass_bits, d_keys_double_buffer.current(), d_values_double_buffer.current(), d_keys_double_buffer.alternate(), d_values_double_buffer.alternate()); d_keys_double_buffer.swap(); d_values_double_buffer.swap(); current_bit += pass_bits; } } } /* * Continuation is called after the partitioning stage. It launches kernels * to sort large and small segments using the partitioning results. Separation * of this stage is required to eliminate device-side synchronization in * the CDP mode. 
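 *
 * A sketch of the grid-size arithmetic performed here. The group sizes and the
 * segments-per-block constants are assumed values for illustration; the real
 * constants come from SmallAndMediumPolicyT.
 *
 * @code
 * const unsigned int group_sizes[2] = {1000u, 4000u}; // [0] = large, [1] = small
 * const int num_segments            = 10000;
 *
 * const unsigned int large_segments  = group_sizes[0]; // one CTA per large segment
 * const unsigned int small_segments  = group_sizes[1];
 * const unsigned int medium_segments =
 *   static_cast<unsigned int>(num_segments) - (large_segments + small_segments);
 *
 * const unsigned int segments_per_small_block  = 8; // assumed
 * const unsigned int segments_per_medium_block = 4; // assumed
 *
 * const unsigned int small_blocks =
 *   (small_segments + segments_per_small_block - 1) / segments_per_small_block;
 * const unsigned int medium_blocks =
 *   (medium_segments + segments_per_medium_block - 1) / segments_per_medium_block;
 *
 * // The large kernel is launched with `large_segments` blocks; the small/medium
 * // kernel is launched with `small_blocks + medium_blocks` blocks.
 * @endcode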
*/ template CUB_RUNTIME_FUNCTION cudaError_t DeviceSegmentedSortContinuation( LargeKernelT large_kernel, SmallKernelT small_kernel, int num_segments, KeyT *d_current_keys, KeyT *d_final_keys, detail::device_double_buffer d_keys_double_buffer, ValueT *d_current_values, ValueT *d_final_values, detail::device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, unsigned int *group_sizes, unsigned int *large_and_medium_segments_indices, unsigned int *small_segments_indices, cudaStream_t stream) { cudaError error = cudaSuccess; const unsigned int large_segments = group_sizes[0]; if (large_segments > 0) { // One CTA per segment const unsigned int blocks_in_grid = large_segments; #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelLarge<<<%d, %d, 0, %lld>>>()\n", static_cast(blocks_in_grid), LargeSegmentPolicyT::BLOCK_THREADS, (long long)stream); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream) .doit(large_kernel, large_and_medium_segments_indices, d_current_keys, d_final_keys, d_keys_double_buffer, d_current_values, d_final_values, d_values_double_buffer, d_begin_offsets, d_end_offsets); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { return error; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { return error; } } const unsigned int small_segments = group_sizes[1]; const unsigned int medium_segments = static_cast(num_segments) - (large_segments + small_segments); const unsigned int small_blocks = DivideAndRoundUp(small_segments, SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); const unsigned int medium_blocks = DivideAndRoundUp(medium_segments, SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); const unsigned int small_and_medium_blocks_in_grid = small_blocks + medium_blocks; if (small_and_medium_blocks_in_grid) { #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelSmall<<<%d, %d, 0, %lld>>>()\n", static_cast(small_and_medium_blocks_in_grid), SmallAndMediumPolicyT::BLOCK_THREADS, (long long)stream); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream) .doit(small_kernel, small_segments, medium_segments, medium_blocks, small_segments_indices, large_and_medium_segments_indices + num_segments - medium_segments, d_current_keys, d_final_keys, d_current_values, d_final_values, d_begin_offsets, d_end_offsets); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { return error; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { return error; } } return error; } #ifdef CUB_RDC_ENABLED /* * Continuation kernel is used only in the CDP mode. It's used to * launch DeviceSegmentedSortContinuation as a separate kernel. 
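 *
 * The launch of this kernel is compiled only when relocatable device code is
 * enabled and is selected per compilation target with NV_IF_TARGET. Below is a
 * sketch of the guard pattern used further down in SortWithPartitioning; the
 * two launch_* helpers are placeholders for the actual triple_chevron launches,
 * not functions defined in this file.
 *
 * @code
 * #ifndef CUB_RDC_ENABLED
 * #define SKETCH_DEVICE_CODE              // no device-side launch without RDC
 * #else
 * #define SKETCH_DEVICE_CODE launch_continuation_kernel_from_device();
 * #endif
 *
 * NV_IF_TARGET(NV_IS_HOST,
 *              (launch_continuation_from_host();),
 *              (SKETCH_DEVICE_CODE));
 *
 * #undef SKETCH_DEVICE_CODE
 * @endcode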
*/ template __launch_bounds__(1) __global__ void DeviceSegmentedSortContinuationKernel( LargeKernelT large_kernel, SmallKernelT small_kernel, int num_segments, KeyT *d_current_keys, KeyT *d_final_keys, detail::device_double_buffer d_keys_double_buffer, ValueT *d_current_values, ValueT *d_final_values, detail::device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, unsigned int *group_sizes, unsigned int *large_and_medium_segments_indices, unsigned int *small_segments_indices) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; // In case of CDP: // 1. each CTA has a different main stream // 2. all streams are non-blocking // 3. child grid always completes before the parent grid // 4. streams can be used only from the CTA in which they were created // 5. streams created on the host cannot be used on the device // // Due to (4, 5), we can't pass the user-provided stream in the continuation. // Due to (1, 2, 3) it's safe to pass the main stream. cudaError_t error = DeviceSegmentedSortContinuation( large_kernel, small_kernel, num_segments, d_current_keys, d_final_keys, d_keys_double_buffer, d_current_values, d_final_values, d_values_double_buffer, d_begin_offsets, d_end_offsets, group_sizes, large_and_medium_segments_indices, small_segments_indices, 0); // always launching on the main stream (see motivation above) CubDebug(error); } #endif // CUB_RDC_ENABLED template struct DeviceSegmentedSortPolicy { using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>; constexpr static int KEYS_ONLY = std::is_same::value; //---------------------------------------------------------------------------- // Architecture-specific tuning policies //---------------------------------------------------------------------------- struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { constexpr static int BLOCK_THREADS = 128; constexpr static int RADIX_BITS = sizeof(KeyT) > 1 ? 6 : 4; constexpr static int PARTITIONING_THRESHOLD = 300; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy; constexpr static int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(5); constexpr static int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(5); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy500 : ChainedPolicy<500, Policy500, Policy350> { constexpr static int BLOCK_THREADS = 256; constexpr static int RADIX_BITS = sizeof(KeyT) > 1 ? 
6 : 4; constexpr static int PARTITIONING_THRESHOLD = 300; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy; constexpr static int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(7); constexpr static int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(7); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy600 : ChainedPolicy<600, Policy600, Policy500> { constexpr static int BLOCK_THREADS = 256; constexpr static int RADIX_BITS = sizeof(KeyT) > 1 ? 6 : 4; constexpr static int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy; constexpr static int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(9); constexpr static int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(9); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy610 : ChainedPolicy<610, Policy610, Policy600> { constexpr static int BLOCK_THREADS = 256; constexpr static int RADIX_BITS = sizeof(KeyT) > 1 ? 6 : 4; constexpr static int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy; constexpr static int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(9); constexpr static int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(9); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy620 : ChainedPolicy<620, Policy620, Policy610> { constexpr static int BLOCK_THREADS = 256; constexpr static int RADIX_BITS = sizeof(KeyT) > 1 ? 5 : 4; constexpr static int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy; constexpr static int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(9); constexpr static int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(9); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy700 : ChainedPolicy<700, Policy700, Policy620> { constexpr static int BLOCK_THREADS = 256; constexpr static int RADIX_BITS = sizeof(KeyT) > 1 ? 
6 : 4; constexpr static int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy; constexpr static int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(7); constexpr static int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(KEYS_ONLY ? 11 : 7); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<(KEYS_ONLY ? 4 : 8), // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy800 : ChainedPolicy<800, Policy800, Policy700> { constexpr static int BLOCK_THREADS = 256; constexpr static int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = cub::AgentRadixSortDownsweepPolicy 1) ? 6 : 4>; constexpr static int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(9); constexpr static int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(KEYS_ONLY ? 7 : 11); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<(KEYS_ONLY ? 4 : 2), // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy860 : ChainedPolicy<860, Policy860, Policy800> { constexpr static int BLOCK_THREADS = 256; constexpr static int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = cub::AgentRadixSortDownsweepPolicy 1) ? 6 : 4>; constexpr static bool LARGE_ITEMS = sizeof(DominantT) > 4; constexpr static int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(LARGE_ITEMS ? 7 : 9); constexpr static int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(LARGE_ITEMS ? 9 : 7); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<(LARGE_ITEMS ? 
8 : 2), // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE, CacheLoadModifier::LOAD_LDG>, // Medium policy cub::AgentSubWarpMergeSortPolicy<16, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE, CacheLoadModifier::LOAD_LDG>>; }; /// MaxPolicy using MaxPolicy = Policy860; }; template > struct DispatchSegmentedSort : SelectedPolicy { static constexpr int KEYS_ONLY = std::is_same::value; struct LargeSegmentsSelectorT { OffsetT value{}; BeginOffsetIteratorT d_offset_begin{}; EndOffsetIteratorT d_offset_end{}; __host__ __device__ __forceinline__ LargeSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end) : value(value) , d_offset_begin(d_offset_begin) , d_offset_end(d_offset_end) {} __host__ __device__ __forceinline__ bool operator()(unsigned int segment_id) const { const OffsetT segment_size = d_offset_end[segment_id] - d_offset_begin[segment_id]; return segment_size > value; } }; struct SmallSegmentsSelectorT { OffsetT value{}; BeginOffsetIteratorT d_offset_begin{}; EndOffsetIteratorT d_offset_end{}; __host__ __device__ __forceinline__ SmallSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end) : value(value) , d_offset_begin(d_offset_begin) , d_offset_end(d_offset_end) {} __host__ __device__ __forceinline__ bool operator()(unsigned int segment_id) const { const OffsetT segment_size = d_offset_end[segment_id] - d_offset_begin[segment_id]; return segment_size < value; } }; // Partition selects large and small groups. The middle group is not selected. constexpr static std::size_t num_selected_groups = 2; /** * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to @p temp_storage_bytes and no work * is done. */ void *d_temp_storage; /// Reference to size in bytes of @p d_temp_storage allocation std::size_t &temp_storage_bytes; /** * Double-buffer whose current buffer contains the unsorted input keys and, * upon return, is updated to point to the sorted output keys */ DoubleBuffer &d_keys; /** * Double-buffer whose current buffer contains the unsorted input values and, * upon return, is updated to point to the sorted output values */ DoubleBuffer &d_values; /// Number of items to sort OffsetT num_items; /// The number of segments that comprise the sorting data int num_segments; /** * Random-access input iterator to the sequence of beginning offsets of length * @p num_segments, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` */ BeginOffsetIteratorT d_begin_offsets; /** * Random-access input iterator to the sequence of ending offsets of length * @p num_segments, such that d_end_offsets[i]-1 is the last element * of the ith data segment in `d_keys_*` and * `d_values_*`. If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, * the ith is considered empty. */ EndOffsetIteratorT d_end_offsets; /// Whether is okay to overwrite source buffers bool is_overwrite_okay; /// CUDA stream to launch kernels within. 
cudaStream_t stream; CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedSort(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, OffsetT num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys(d_keys) , d_values(d_values) , num_items(num_items) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , is_overwrite_okay(is_overwrite_okay) , stream(stream) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedSort(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, OffsetT num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys(d_keys) , d_values(d_values) , num_items(num_items) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , is_overwrite_okay(is_overwrite_okay) , stream(stream) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; static_assert( LargeSegmentPolicyT::LOAD_MODIFIER != CacheLoadModifier::LOAD_LDG, "The memory consistency model does not apply to texture accesses"); static_assert( KEYS_ONLY || LargeSegmentPolicyT::LOAD_ALGORITHM != BLOCK_LOAD_STRIPED || SmallAndMediumPolicyT::MediumPolicyT::LOAD_ALGORITHM != WARP_LOAD_STRIPED || SmallAndMediumPolicyT::SmallPolicyT::LOAD_ALGORITHM != WARP_LOAD_STRIPED, "Striped load will make this algorithm unstable"); static_assert( SmallAndMediumPolicyT::MediumPolicyT::STORE_ALGORITHM != WARP_STORE_STRIPED || SmallAndMediumPolicyT::SmallPolicyT::STORE_ALGORITHM != WARP_STORE_STRIPED, "Striped stores will produce unsorted results"); constexpr int radix_bits = LargeSegmentPolicyT::RADIX_BITS; cudaError error = cudaSuccess; do { //------------------------------------------------------------------------ // Prepare temporary storage layout //------------------------------------------------------------------------ const bool partition_segments = num_segments > ActivePolicyT::PARTITIONING_THRESHOLD; cub::detail::temporary_storage::layout<5> temporary_storage_layout; auto keys_slot = temporary_storage_layout.get_slot(0); auto values_slot = temporary_storage_layout.get_slot(1); auto large_and_medium_partitioning_slot = temporary_storage_layout.get_slot(2); auto small_partitioning_slot = temporary_storage_layout.get_slot(3); auto group_sizes_slot = temporary_storage_layout.get_slot(4); auto keys_allocation = keys_slot->create_alias(); auto values_allocation = values_slot->create_alias(); if (!is_overwrite_okay) { keys_allocation.grow(num_items); if (!KEYS_ONLY) { values_allocation.grow(num_items); } } auto large_and_medium_segments_indices = large_and_medium_partitioning_slot->create_alias(); auto small_segments_indices = small_partitioning_slot->create_alias(); auto group_sizes = group_sizes_slot->create_alias(); std::size_t three_way_partition_temp_storage_bytes {}; 
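/*
 * Sketch of the temporary-storage bookkeeping used in this function: every
 * allocation is described by a slot in a `temporary_storage::layout`, aliases
 * are grown to the required element counts, and the layout is either queried
 * for its total size or mapped onto the user-provided blob. The two-slot
 * example below is a simplified sketch; exact signatures follow
 * cub::detail::temporary_storage in this CUB version.
 *
 * @code
 * cub::detail::temporary_storage::layout<2> layout;
 *
 * auto keys_slot   = layout.get_slot(0);
 * auto values_slot = layout.get_slot(1);
 *
 * auto keys_allocation   = keys_slot->create_alias<KeyT>();
 * auto values_allocation = values_slot->create_alias<ValueT>();
 *
 * keys_allocation.grow(num_items);   // request `num_items` elements of KeyT
 * values_allocation.grow(num_items);
 *
 * if (d_temp_storage == nullptr)
 * {
 *   temp_storage_bytes = layout.get_size();  // size query only, no work done
 * }
 * else
 * {
 *   layout.map_to_buffer(d_temp_storage, temp_storage_bytes);
 *   KeyT *scratch_keys = keys_allocation.get(); // valid only after mapping
 * }
 * @endcode
 */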
LargeSegmentsSelectorT large_segments_selector( SmallAndMediumPolicyT::MediumPolicyT::ITEMS_PER_TILE, d_begin_offsets, d_end_offsets); SmallSegmentsSelectorT small_segments_selector( SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_TILE + 1, d_begin_offsets, d_end_offsets); auto device_partition_temp_storage = keys_slot->create_alias(); if (partition_segments) { large_and_medium_segments_indices.grow(num_segments); small_segments_indices.grow(num_segments); group_sizes.grow(num_selected_groups); auto medium_indices_iterator = THRUST_NS_QUALIFIER::make_reverse_iterator( large_and_medium_segments_indices.get()); cub::DevicePartition::If( nullptr, three_way_partition_temp_storage_bytes, THRUST_NS_QUALIFIER::counting_iterator(0), large_and_medium_segments_indices.get(), small_segments_indices.get(), medium_indices_iterator, group_sizes.get(), num_segments, large_segments_selector, small_segments_selector, stream); device_partition_temp_storage.grow( three_way_partition_temp_storage_bytes); } if (d_temp_storage == nullptr) { temp_storage_bytes = temporary_storage_layout.get_size(); // Return if the caller is simply requesting the size of the storage // allocation break; } if (num_items == 0 || num_segments == 0) { break; } if (CubDebug( error = temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes))) { break; } //------------------------------------------------------------------------ // Sort //------------------------------------------------------------------------ const bool is_num_passes_odd = GetNumPasses(radix_bits) & 1; /** * This algorithm sorts segments that don't fit into shared memory with * the in-global-memory radix sort. Radix sort splits key representation * into multiple "digits". Each digit is RADIX_BITS wide. The algorithm * iterates over these digits. Each of these iterations consists of a * couple of stages. The first stage computes a histogram for a current * digit in each segment key. This histogram helps to determine the * starting position of the keys group with a similar digit. * For example: * keys_digits = [ 1, 0, 0, 1 ] * digit_prefix = [ 0, 2 ] * The second stage checks the keys again and increments the prefix to * determine the final position of the key: * * expression | key | idx | result * ----------------------------------- | ----- | ------- | -------------- * result[prefix[keys[0]]++] = keys[0] | 1 | 2 | [ ?, ?, 1, ? ] * result[prefix[keys[1]]++] = keys[0] | 0 | 0 | [ 0, ?, 1, ? ] * result[prefix[keys[2]]++] = keys[0] | 0 | 1 | [ 0, 0, 1, ? ] * result[prefix[keys[3]]++] = keys[0] | 1 | 3 | [ 0, 0, 1, 1 ] * * If the resulting memory is aliased to the input one, we'll face the * following issues: * * input | key | idx | result/input | issue * -------------- | ----- | ------- | ---------------- | ---------------- * [ 1, 0, 0, 1 ] | 1 | 2 | [ 1, 0, 1, 1 ] | overwrite keys[2] * [ 1, 0, 1, 1 ] | 0 | 0 | [ 0, 0, 1, 1 ] | * [ 0, 0, 1, 1 ] | 1 | 3 | [ 0, 0, 1, 1 ] | extra key * [ 0, 0, 1, 1 ] | 1 | 4 | [ 0, 0, 1, 1 ] 1 | OOB access * * To avoid these issues, we have to use extra memory. The extra memory * holds temporary storage for writing intermediate results of each stage. * Since we iterate over digits in keys, we potentially need: * `sizeof(KeyT) * num_items * DivideAndRoundUp(sizeof(KeyT),RADIX_BITS)` * auxiliary memory bytes. To reduce the auxiliary memory storage * requirements, the algorithm relies on a double buffer facility. The * idea behind it is in swapping destination and source buffers at each * iteration. 
This way, we can use only two buffers. One of these buffers * can be the final algorithm output destination. Therefore, only one * auxiliary array is needed. Depending on the number of iterations, we * can initialize the double buffer so that the algorithm output array * will match the double buffer result one at the final iteration. * A user can provide this algorithm with a double buffer straightaway to * further reduce the auxiliary memory requirements. `is_overwrite_okay` * indicates this use case. */ detail::device_double_buffer d_keys_double_buffer( (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : keys_allocation.get(), (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? keys_allocation.get() : d_keys.Alternate()); detail::device_double_buffer d_values_double_buffer( (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : values_allocation.get(), (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? values_allocation.get() : d_values.Alternate()); if (partition_segments) { // Partition input segments into size groups and assign specialized // kernels for each of them. error = SortWithPartitioning( DeviceSegmentedSortKernelLarge, DeviceSegmentedSortKernelSmall, three_way_partition_temp_storage_bytes, d_keys_double_buffer, d_values_double_buffer, large_segments_selector, small_segments_selector, device_partition_temp_storage, large_and_medium_segments_indices, small_segments_indices, group_sizes); } else { // If there are not enough segments, there's no reason to spend time // on extra partitioning steps. error = SortWithoutPartitioning( DeviceSegmentedSortFallbackKernel, d_keys_double_buffer, d_values_double_buffer); } d_keys.selector = GetFinalSelector(d_keys.selector, radix_bits); d_values.selector = GetFinalSelector(d_values.selector, radix_bits); } while (false); return error; } CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, OffsetT num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, cudaStream_t stream) { using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Create dispatch functor DispatchSegmentedSort dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) { break; } } while (false); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, std::size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, OffsetT num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } private: CUB_RUNTIME_FUNCTION __forceinline__ int GetNumPasses(int radix_bits) { const int byte_size = 8; const int num_bits = sizeof(KeyT) * byte_size; const int num_passes = DivideAndRoundUp(num_bits, radix_bits); return num_passes; } 
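/*
 * Worked example of the pass-count and selector arithmetic implemented by
 * GetNumPasses and GetFinalSelector below. The digit width is an assumed
 * example; the real value is LargeSegmentPolicyT::RADIX_BITS.
 *
 * @code
 * const int radix_bits = 6;                                            // assumed
 * const int num_bits   = static_cast<int>(sizeof(std::uint32_t)) * 8;  // 32
 * const int num_passes = cub::DivideAndRoundUp(num_bits, radix_bits);  // 6
 *
 * int selector = 0; // DoubleBuffer<KeyT>::selector on input
 *
 * // is_overwrite_okay == true: the output lands wherever the last pass wrote,
 * // so the final selector advances by one per radix pass.
 * const int final_selector = (selector + num_passes) & 1;              // 0 for 6 passes
 *
 * // is_overwrite_okay == false: sorted data always ends up in the other buffer.
 * const int final_selector_no_overwrite = (selector + 1) & 1;          // 1
 * @endcode
 */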
CUB_RUNTIME_FUNCTION __forceinline__ int GetFinalSelector(int selector, int radix_bits) { // Sorted data always ends up in the other vector if (!is_overwrite_okay) { return (selector + 1) & 1; } return (selector + GetNumPasses(radix_bits)) & 1; } template CUB_RUNTIME_FUNCTION __forceinline__ T* GetFinalOutput(int radix_bits, DoubleBuffer &buffer) { const int final_selector = GetFinalSelector(buffer.selector, radix_bits); return buffer.d_buffers[final_selector]; } template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SortWithPartitioning( LargeKernelT large_kernel, SmallKernelT small_kernel, std::size_t three_way_partition_temp_storage_bytes, cub::detail::device_double_buffer &d_keys_double_buffer, cub::detail::device_double_buffer &d_values_double_buffer, LargeSegmentsSelectorT &large_segments_selector, SmallSegmentsSelectorT &small_segments_selector, cub::detail::temporary_storage::alias &device_partition_temp_storage, cub::detail::temporary_storage::alias &large_and_medium_segments_indices, cub::detail::temporary_storage::alias &small_segments_indices, cub::detail::temporary_storage::alias &group_sizes) { cudaError_t error = cudaSuccess; auto medium_indices_iterator = THRUST_NS_QUALIFIER::make_reverse_iterator( large_and_medium_segments_indices.get() + num_segments); error = cub::DevicePartition::If( device_partition_temp_storage.get(), three_way_partition_temp_storage_bytes, THRUST_NS_QUALIFIER::counting_iterator(0), large_and_medium_segments_indices.get(), small_segments_indices.get(), medium_indices_iterator, group_sizes.get(), num_segments, large_segments_selector, small_segments_selector, stream); if (CubDebug(error)) { return error; } // The device path is only used (and only compiles) when CDP is enabled. // It's defined in a macro since we can't put `#ifdef`s inside of // `NV_IF_TARGET`. 
#ifndef CUB_RDC_ENABLED #define CUB_TEMP_DEVICE_CODE #else // CUB_RDC_ENABLED #define CUB_TEMP_DEVICE_CODE \ using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; \ error = \ THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ .doit(DeviceSegmentedSortContinuationKernel, \ large_kernel, \ small_kernel, \ num_segments, \ d_keys.Current(), \ GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ d_keys_double_buffer, \ d_values.Current(), \ GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ d_values_double_buffer, \ d_begin_offsets, \ d_end_offsets, \ group_sizes.get(), \ large_and_medium_segments_indices.get(), \ small_segments_indices.get()); \ \ if (CubDebug(error)) \ { \ return error; \ } \ \ error = detail::DebugSyncStream(stream); \ if (CubDebug(error)) \ { \ return error; \ } #endif // CUB_RDC_ENABLED // Clang format mangles some of this NV_IF_TARGET block // clang-format off NV_IF_TARGET( NV_IS_HOST, ( unsigned int h_group_sizes[num_selected_groups]; if (CubDebug(error = cudaMemcpyAsync(h_group_sizes, group_sizes.get(), num_selected_groups * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream))) { return error; } if (CubDebug(error = SyncStream(stream))) { return error; } error = DeviceSegmentedSortContinuation( large_kernel, small_kernel, num_segments, d_keys.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), d_keys_double_buffer, d_values.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), d_values_double_buffer, d_begin_offsets, d_end_offsets, h_group_sizes, large_and_medium_segments_indices.get(), small_segments_indices.get(), stream);), // NV_IS_DEVICE: (CUB_TEMP_DEVICE_CODE)); // clang-format on #undef CUB_TEMP_DEVICE_CODE return error; } template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SortWithoutPartitioning( FallbackKernelT fallback_kernel, cub::detail::device_double_buffer &d_keys_double_buffer, cub::detail::device_double_buffer &d_values_double_buffer) { cudaError_t error = cudaSuccess; const auto blocks_in_grid = static_cast(num_segments); const auto threads_in_block = static_cast(LargeSegmentPolicyT::BLOCK_THREADS); // Log kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceSegmentedSortFallbackKernel<<<%d, %d, " "0, %lld>>>(), %d items per thread, bit_grain %d\n", blocks_in_grid, threads_in_block, (long long)stream, LargeSegmentPolicyT::ITEMS_PER_THREAD, LargeSegmentPolicyT::RADIX_BITS); #endif // Invoke fallback kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(fallback_kernel, d_keys.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), d_keys_double_buffer, d_values.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), d_values_double_buffer, d_begin_offsets, d_end_offsets); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { return error; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { return error; } return error; } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_select_if.cuh000066400000000000000000000551621434614775400224370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * Select kernel entry point (multi-block) * * Performs functor-based selection if SelectOpT functor type != NullType * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) */ template < typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicyT tuning policy type typename InputIteratorT, ///< Random-access input iterator type for reading input items typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected typename ScanTileStateT, ///< Tile status interface type typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) typename OffsetT, ///< Signed integer type for global offsets bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output __launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) __global__ void DeviceSelectSweepKernel( InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items FlagsInputIteratorT d_flags, ///< [in] 
Pointer to the input sequence of selection flags (if applicable) SelectedOutputIteratorT d_selected_out, ///< [out] Pointer to the output sequence of selected data items NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) ScanTileStateT tile_status, ///< [in] Tile status interface SelectOpT select_op, ///< [in] Selection operator EqualityOpT equality_op, ///< [in] Equality operator OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) int num_tiles) ///< [in] Total number of tiles for the entire problem { // Thread block type for selecting data from input tiles typedef AgentSelectIf< AgentSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS> AgentSelectIfT; // Shared memory for AgentSelectIf __shared__ typename AgentSelectIfT::TempStorage temp_storage; // Process tiles AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange( num_tiles, tile_status, d_num_selected_out); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect */ template < typename InputIteratorT, ///< Random-access input iterator type for reading input items typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) typename OffsetT, ///< Signed integer type for global offsets bool KEEP_REJECTS, ///< Whether or not we push rejected items to the back of the output bool MayAlias = false> struct DispatchSelectIf { /****************************************************************************** * Types and constants ******************************************************************************/ // The input value type using InputT = cub::detail::value_t; // The flag value type using FlagT = cub::detail::value_t; enum { INIT_KERNEL_THREADS = 128, }; // Tile status descriptor interface type typedef ScanTileState ScanTileStateT; /****************************************************************************** * Tuning policies ******************************************************************************/ /// SM35 struct Policy350 { enum { NOMINAL_4B_ITEMS_PER_THREAD = 10, ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))), }; typedef AgentSelectIfPolicy< 128, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, MayAlias ? 
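// When the output may alias the input (MayAlias == true) the policy uses
// LOAD_CA rather than LOAD_LDG: LDG routes loads through the read-only data
// cache, which assumes the data is not written for the lifetime of the
// kernel, so it is only safe in the non-aliasing case.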
LOAD_CA : LOAD_LDG, BLOCK_SCAN_WARP_SCANS> SelectIfPolicyT; }; /****************************************************************************** * Tuning policies of current PTX compiler pass ******************************************************************************/ typedef Policy350 PtxPolicy; // "Opaque" policies (whose parameterizations aren't reflected in the type signature) struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {}; /****************************************************************************** * Utilities ******************************************************************************/ /** * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use */ template CUB_RUNTIME_FUNCTION __forceinline__ static void InitConfigs( int ptx_version, KernelConfig &select_if_config) { NV_IF_TARGET(NV_IS_DEVICE, ( (void)ptx_version; // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy select_if_config.template Init(); ), ( // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version // (There's only one policy right now) (void)ptx_version; select_if_config.template Init(); )); } /** * Kernel kernel dispatch configuration. */ struct KernelConfig { int block_threads; int items_per_thread; int tile_items; template CUB_RUNTIME_FUNCTION __forceinline__ void Init() { block_threads = PolicyT::BLOCK_THREADS; items_per_thread = PolicyT::ITEMS_PER_THREAD; tile_items = block_threads * items_per_thread; } }; /****************************************************************************** * Dispatch entrypoints ******************************************************************************/ /** * Internal dispatch routine for computing a device-wide selection using the * specified kernel functions. */ template < typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) SelectOpT select_op, ///< [in] Selection operator EqualityOpT equality_op, ///< [in] Equality operator OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. 
int /*ptx_version*/, ///< [in] PTX version of dispatch kernels ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for { cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; // Number of input tiles int tile_size = select_if_config.block_threads * select_if_config.items_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) void* allocations[1] = {}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the tile status interface ScanTileStateT tile_status; if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; // Log scan_init_kernel configuration int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke scan_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( init_grid_size, INIT_KERNEL_THREADS, 0, stream ).doit(scan_init_kernel, tile_status, num_tiles, d_num_selected_out); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Return if empty problem if (num_items == 0) break; // Get max x-dimension of grid int max_dim_x; if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; if (CubDebug(error = MaxSmOccupancy(range_select_sm_occupancy, // out select_if_kernel, select_if_config.block_threads))) { break; } _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, " "%lld>>>(), %d items per thread, %d SM occupancy\n", scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long)stream, select_if_config.items_per_thread, range_select_sm_occupancy); } #endif // Invoke select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( scan_grid_size, select_if_config.block_threads, 0, stream ).doit(select_if_kernel, d_in, d_flags, d_selected_out, d_num_selected_out, tile_status, select_op, equality_op, num_items, num_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) 
{ break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } while (0); return error; } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int ptx_version, ScanInitKernelPtrT scan_init_kernel, SelectIfKernelPtrT select_if_kernel, KernelConfig select_if_config) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_selected_out, d_num_selected_out, select_op, equality_op, num_items, stream, ptx_version, scan_init_kernel, select_if_kernel, select_if_config); } /** * Internal dispatch routine */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) SelectOpT select_op, ///< [in] Selection operator EqualityOpT equality_op, ///< [in] Equality operator OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) cudaStream_t stream) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. { cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Get kernel kernel dispatch configurations KernelConfig select_if_config; InitConfigs(ptx_version, select_if_config); // Dispatch if (CubDebug(error = Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_selected_out, d_num_selected_out, select_op, equality_op, num_items, stream, ptx_version, DeviceCompactInitKernel, DeviceSelectSweepKernel, select_if_config))) break; } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_selected_out, d_num_selected_out, select_op, equality_op, num_items, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_spmv_orig.cuh000066400000000000000000001000471434614775400225000ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * SpMV kernel entry points *****************************************************************************/ /** * Spmv search kernel. Identifies merge path starting coordinates for each tile. */ template < typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type typename ValueT, ///< Matrix and vector value type typename OffsetT> ///< Signed integer type for sequence offsets __global__ void DeviceSpmv1ColKernel( SpmvParams spmv_params) ///< [in] SpMV input parameter bundle { typedef CacheModifiedInputIterator< AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, ValueT, OffsetT> VectorValueIteratorT; VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; if (row_idx < spmv_params.num_rows) { OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; ValueT value = 0.0; if (end_nonzero_idx != nonzero_idx) { value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; } spmv_params.d_vector_y[row_idx] = value; } } /** * Spmv search kernel. Identifies merge path starting coordinates for each tile. 
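 *
 * Each thread handles one merge-path diagonal (tile_idx * TILE_ITEMS): it
 * binary-searches the merge path formed by the row-end offsets and the
 * natural sequence of nonzero indices, and records the (row, nonzero)
 * coordinate at which its tile begins. The extra thread at
 * tile_idx == num_merge_tiles records the end coordinate of the last tile.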
*/ template < typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type typename OffsetT, ///< Signed integer type for sequence offsets typename CoordinateT, ///< Merge path coordinate type typename SpmvParamsT> ///< SpmvParams type __global__ void DeviceSpmvSearchKernel( int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle { /// Constants enum { BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, }; typedef CacheModifiedInputIterator< SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, OffsetT, OffsetT> RowOffsetsSearchIteratorT; // Find the starting coordinate for all tiles (plus the end coordinate of the last one) int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; if (tile_idx < num_merge_tiles + 1) { OffsetT diagonal = (tile_idx * TILE_ITEMS); CoordinateT tile_coordinate; CountingInputIterator nonzero_indices(0); // Search the merge path MergePathSearch( diagonal, RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), nonzero_indices, spmv_params.num_rows, spmv_params.num_nonzeros, tile_coordinate); // Output starting offset d_tile_coordinates[tile_idx] = tile_coordinate; } } /** * Spmv agent entry point */ template < typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type typename ScanTileStateT, ///< Tile status interface type typename ValueT, ///< Matrix and vector value type typename OffsetT, ///< Signed integer type for sequence offsets typename CoordinateT, ///< Merge path coordinate type bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 bool HAS_BETA> ///< Whether the input parameter Beta is 0 __launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) __global__ void DeviceSpmvKernel( SpmvParams spmv_params, ///< [in] SpMV input parameter bundle CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block int num_tiles, ///< [in] Number of merge tiles ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) { // Spmv agent type specialization typedef AgentSpmv< SpmvPolicyT, ValueT, OffsetT, HAS_ALPHA, HAS_BETA> AgentSpmvT; // Shared memory for AgentSpmv __shared__ typename AgentSpmvT::TempStorage temp_storage; AgentSpmvT(temp_storage, spmv_params).ConsumeTile( d_tile_coordinates, d_tile_carry_pairs, num_tiles); // Initialize fixup tile status tile_state.InitializeStatus(num_segment_fixup_tiles); } /** * Multi-block reduce-by-key sweep kernel entry point */ template < typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type typename PairsInputIteratorT, ///< Random-access input iterator type for keys typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values typename OffsetT, ///< Signed integer type for global offsets typename ScanTileStateT> ///< Tile status interface type __launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) __global__ void DeviceSegmentFixupKernel( PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block AggregatesOutputIteratorT 
d_aggregates_out, ///< [in,out] Output value aggregates OffsetT num_items, ///< [in] Total number of items to select from int num_tiles, ///< [in] Total number of tiles for the entire problem ScanTileStateT tile_state) ///< [in] Tile status interface { // Thread block type for reducing tiles of value segments typedef AgentSegmentFixup< AgentSegmentFixupPolicyT, PairsInputIteratorT, AggregatesOutputIteratorT, cub::Equality, cub::Sum, OffsetT> AgentSegmentFixupT; // Shared memory for AgentSegmentFixup __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; // Process tiles AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange( num_items, num_tiles, tile_state); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv */ template < typename ValueT, ///< Matrix and vector value type typename OffsetT> ///< Signed integer type for global offsets struct DispatchSpmv { //--------------------------------------------------------------------- // Constants and Types //--------------------------------------------------------------------- enum { INIT_KERNEL_THREADS = 128 }; // SpmvParams bundle type typedef SpmvParams SpmvParamsT; // 2D merge path coordinate type typedef typename CubVector::Type CoordinateT; // Tile status descriptor interface type typedef ReduceByKeyScanTileState ScanTileStateT; // Tuple type for scanning (pairs accumulated segment-value with segment-index) typedef KeyValuePair KeyValuePairT; //--------------------------------------------------------------------- // Tuning policies //--------------------------------------------------------------------- /// SM35 struct Policy350 { typedef AgentSpmvPolicy< (sizeof(ValueT) > 4) ? 96 : 128, (sizeof(ValueT) > 4) ? 4 : 7, LOAD_LDG, LOAD_CA, LOAD_LDG, LOAD_LDG, LOAD_LDG, (sizeof(ValueT) > 4) ? true : false, BLOCK_SCAN_WARP_SCANS> SpmvPolicyT; typedef AgentSegmentFixupPolicy< 128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS> SegmentFixupPolicyT; }; /// SM37 struct Policy370 { typedef AgentSpmvPolicy< (sizeof(ValueT) > 4) ? 128 : 128, (sizeof(ValueT) > 4) ? 9 : 14, LOAD_LDG, LOAD_CA, LOAD_LDG, LOAD_LDG, LOAD_LDG, false, BLOCK_SCAN_WARP_SCANS> SpmvPolicyT; typedef AgentSegmentFixupPolicy< 128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS> SegmentFixupPolicyT; }; /// SM50 struct Policy500 { typedef AgentSpmvPolicy< (sizeof(ValueT) > 4) ? 64 : 128, (sizeof(ValueT) > 4) ? 6 : 7, LOAD_LDG, LOAD_DEFAULT, (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, LOAD_LDG, (sizeof(ValueT) > 4) ? true : false, (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> SpmvPolicyT; typedef AgentSegmentFixupPolicy< 128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_RAKING_MEMOIZE> SegmentFixupPolicyT; }; /// SM60 struct Policy600 { typedef AgentSpmvPolicy< (sizeof(ValueT) > 4) ? 64 : 128, (sizeof(ValueT) > 4) ? 
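// As with the other SpMV policies, fewer items per thread are used for
// 8-byte value types, presumably to bound per-block register and
// shared-memory usage.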
5 : 7, LOAD_DEFAULT, LOAD_DEFAULT, LOAD_DEFAULT, LOAD_DEFAULT, LOAD_DEFAULT, false, BLOCK_SCAN_WARP_SCANS> SpmvPolicyT; typedef AgentSegmentFixupPolicy< 128, 3, BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_SCAN_WARP_SCANS> SegmentFixupPolicyT; }; //--------------------------------------------------------------------- // Tuning policies of current PTX compiler pass //--------------------------------------------------------------------- #if (CUB_PTX_ARCH >= 600) typedef Policy600 PtxPolicy; #elif (CUB_PTX_ARCH >= 500) typedef Policy500 PtxPolicy; #elif (CUB_PTX_ARCH >= 370) typedef Policy370 PtxPolicy; #else typedef Policy350 PtxPolicy; #endif // "Opaque" policies (whose parameterizations aren't reflected in the type signature) struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; //--------------------------------------------------------------------- // Utilities //--------------------------------------------------------------------- /** * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use */ template CUB_RUNTIME_FUNCTION __forceinline__ static void InitConfigs( int ptx_version, KernelConfig &spmv_config, KernelConfig &segment_fixup_config) { NV_IF_TARGET( NV_IS_DEVICE, ( // We're on the device, so initialize the kernel dispatch // configurations with the current PTX policy spmv_config.template Init(); segment_fixup_config.template Init();), ( // We're on the host, so lookup and initialize the kernel dispatch // configurations with the policies that match the device's PTX // version if (ptx_version >= 600) { spmv_config.template Init(); segment_fixup_config .template Init(); } else if (ptx_version >= 500) { spmv_config.template Init(); segment_fixup_config .template Init(); } else if (ptx_version >= 370) { spmv_config.template Init(); segment_fixup_config .template Init(); } else { spmv_config.template Init(); segment_fixup_config .template Init(); })); } /** * Kernel kernel dispatch configuration. */ struct KernelConfig { int block_threads; int items_per_thread; int tile_items; template CUB_RUNTIME_FUNCTION __forceinline__ void Init() { block_threads = PolicyT::BLOCK_THREADS; items_per_thread = PolicyT::ITEMS_PER_THREAD; tile_items = block_threads * items_per_thread; } }; //--------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------- /** * Internal dispatch routine for computing a device-wide reduction using the * specified kernel functions. * * If the input is larger than a single tile, this method uses two-passes of * kernel invocations. */ template < typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SpmvParamsT& spmv_params, ///< SpMV input parameter bundle cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. 
Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for { cudaError error = cudaSuccess; do { if (spmv_params.num_rows < 0 || spmv_params.num_cols < 0) { return cudaErrorInvalidValue; } if (spmv_params.num_rows == 0 || spmv_params.num_cols == 0) { // Empty problem, no-op. if (d_temp_storage == NULL) { temp_storage_bytes = 1; } break; } if (spmv_params.num_cols == 1) { if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation temp_storage_bytes = 1; break; } // Get search/init grid dims int degen_col_kernel_block_size = INIT_KERNEL_THREADS; int degen_col_kernel_grid_size = cub::DivideAndRoundUp(spmv_params.num_rows, degen_col_kernel_block_size); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); #endif // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream ).doit(spmv_1col_kernel, spmv_params); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } break; } // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; // Get SM count int sm_count; if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; // Get max x-dimension of grid int max_dim_x; if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; // Total number of spmv work items int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; // Tile sizes of kernels int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; // Number of tiles for kernels int num_merge_tiles = cub::DivideAndRoundUp(num_merge_items, merge_tile_size); int num_segment_fixup_tiles = cub::DivideAndRoundUp(num_merge_tiles, segment_fixup_tile_size); // Get SM occupancy for kernels int spmv_sm_occupancy; if (CubDebug(error = MaxSmOccupancy( spmv_sm_occupancy, spmv_kernel, spmv_config.block_threads))) break; int segment_fixup_sm_occupancy; if (CubDebug(error = MaxSmOccupancy( segment_fixup_sm_occupancy, segment_fixup_kernel, segment_fixup_config.block_threads))) break; // Get grid dimensions dim3 spmv_grid_size( CUB_MIN(num_merge_tiles, max_dim_x), cub::DivideAndRoundUp(num_merge_tiles, max_dim_x), 1); dim3 segment_fixup_grid_size( CUB_MIN(num_segment_fixup_tiles, max_dim_x), cub::DivideAndRoundUp(num_segment_fixup_tiles, max_dim_x), 1); // Get the temporary storage allocation requirements size_t allocation_sizes[3]; if 
(CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break; // bytes needed for reduce-by-key tile status descriptors allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) void* allocations[3] = {}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the tile status interface ScanTileStateT tile_state; if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break; // Alias the other allocations KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates // Get search/init grid dims int search_block_size = INIT_KERNEL_THREADS; int search_grid_size = cub::DivideAndRoundUp(num_merge_tiles + 1, search_block_size); if (search_grid_size < sm_count) // if (num_merge_tiles < spmv_sm_occupancy * sm_count) { // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords d_tile_coordinates = NULL; } else { // Use separate search kernel if we have enough spmv tiles to saturate the device // Log spmv_search_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", search_grid_size, search_block_size, (long long) stream); #endif // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( search_grid_size, search_block_size, 0, stream ).doit(spmv_search_kernel, num_merge_tiles, d_tile_coordinates, spmv_params); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } // Log spmv_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); #endif // Invoke spmv_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( spmv_grid_size, spmv_config.block_threads, 0, stream ).doit(spmv_kernel, spmv_params, d_tile_coordinates, d_tile_carry_pairs, num_merge_tiles, tile_state, num_segment_fixup_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Run reduce-by-key fixup if necessary if (num_merge_tiles > 1) { // Log segment_fixup_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); #endif // Invoke segment_fixup_kernel 
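// Each merge tile that ends partway through a row has emitted a
// (row-id, partial dot-product) carry-out pair into d_tile_carry_pairs.
// The fixup pass reduces those pairs by row id (cub::Equality / cub::Sum)
// and folds the results into d_vector_y, completing rows that were split
// across tiles; with a single merge tile there is nothing to fix up, hence
// the (num_merge_tiles > 1) guard above.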
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream ).doit(segment_fixup_kernel, d_tile_carry_pairs, spmv_params.d_vector_y, num_merge_tiles, num_segment_fixup_tiles, tile_state); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } } while (0); return error; } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, SpmvParamsT &spmv_params, cudaStream_t stream, bool debug_synchronous, Spmv1ColKernelT spmv_1col_kernel, SpmvSearchKernelT spmv_search_kernel, SpmvKernelT spmv_kernel, SegmentFixupKernelT segment_fixup_kernel, KernelConfig spmv_config, KernelConfig segment_fixup_config) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream, spmv_1col_kernel, spmv_search_kernel, spmv_kernel, segment_fixup_kernel, spmv_config, segment_fixup_config); } /** * Internal dispatch routine for computing a device-wide reduction */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SpmvParamsT& spmv_params, ///< SpMV input parameter bundle cudaStream_t stream = 0) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. { cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Get kernel kernel dispatch configurations KernelConfig spmv_config, segment_fixup_config; InitConfigs(ptx_version, spmv_config, segment_fixup_config); if (CubDebug(error = Dispatch( d_temp_storage, temp_storage_bytes, spmv_params, stream, DeviceSpmv1ColKernel, DeviceSpmvSearchKernel, DeviceSpmvKernel, DeviceSegmentFixupKernel, spmv_config, segment_fixup_config))) break; } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, SpmvParamsT &spmv_params, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_three_way_partition.cuh000066400000000000000000000502421434614775400245540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ template __launch_bounds__(int(AgentThreeWayPartitionPolicyT::BLOCK_THREADS)) __global__ void DeviceThreeWayPartitionKernel(InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_status_1, ScanTileStateT tile_status_2, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, int num_tiles) { // Thread block type for selecting data from input tiles using AgentThreeWayPartitionT = AgentThreeWayPartition; // Shared memory for AgentThreeWayPartition __shared__ typename AgentThreeWayPartitionT::TempStorage temp_storage; // Process tiles AgentThreeWayPartitionT(temp_storage, d_in, d_first_part_out, d_second_part_out, d_unselected_out, select_first_part_op, select_second_part_op, num_items) .ConsumeRange(num_tiles, tile_status_1, tile_status_2, d_num_selected_out); } /** * @brief Initialization kernel for tile status initialization (multi-block) * * @tparam ScanTileStateT * Tile status interface type * * @tparam NumSelectedIteratorT * Output iterator type for recording the number of items selected * * @param[in] tile_state_1 * Tile status interface * * @param[in] tile_state_2 * Tile status interface * * @param[in] num_tiles * Number of tiles * * @param[out] d_num_selected_out * Pointer to the total number of items selected * (i.e., length of @p d_selected_out) */ template __global__ void DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state_1, ScanTileStateT tile_state_2, int num_tiles, NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state_1.InitializeStatus(num_tiles); tile_state_2.InitializeStatus(num_tiles); // Initialize d_num_selected_out if (blockIdx.x == 0) { if (threadIdx.x < 2) { d_num_selected_out[threadIdx.x] = 0; } } } /****************************************************************************** * Dispatch ******************************************************************************/ template struct DispatchThreeWayPartitionIf { /***************************************************************************** * Types and constants 
****************************************************************************/ using InputT = cub::detail::value_t; using ScanTileStateT = cub::ScanTileState; constexpr static int INIT_KERNEL_THREADS = 256; /***************************************************************************** * Tuning policies ****************************************************************************/ /// SM35 struct Policy350 { constexpr static int ITEMS_PER_THREAD = Nominal4BItemsToItems(9); using ThreeWayPartitionPolicy = cub::AgentThreeWayPartitionPolicy<256, ITEMS_PER_THREAD, cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS>; }; /***************************************************************************** * Tuning policies of current PTX compiler pass ****************************************************************************/ using PtxPolicy = Policy350; // "Opaque" policies (whose parameterizations aren't reflected in the type signature) struct PtxThreeWayPartitionPolicyT : PtxPolicy::ThreeWayPartitionPolicy {}; /***************************************************************************** * Utilities ****************************************************************************/ /** * Initialize kernel dispatch configurations with the policies corresponding * to the PTX assembly we will use */ template CUB_RUNTIME_FUNCTION __forceinline__ static void InitConfigs( int ptx_version, KernelConfig &select_if_config) { NV_IF_TARGET( NV_IS_DEVICE, ((void)ptx_version; // We're on the device, so initialize the kernel dispatch configurations // with the current PTX policy select_if_config.template Init();), (// We're on the host, so lookup and initialize the kernel dispatch // configurations with the policies that match the device's PTX version // (There's only one policy right now) (void)ptx_version; select_if_config .template Init();)); } /** * Kernel dispatch configuration. 
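 *
 * Captures BLOCK_THREADS and ITEMS_PER_THREAD of the selected policy (and
 * their product, the tile size) so the dispatch code can compute tile
 * counts and grid dimensions at run time.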
*/ struct KernelConfig { int block_threads; int items_per_thread; int tile_items; template CUB_RUNTIME_FUNCTION __forceinline__ void Init() { block_threads = PolicyT::BLOCK_THREADS; items_per_thread = PolicyT::ITEMS_PER_THREAD; tile_items = block_threads * items_per_thread; } }; /***************************************************************************** * Dispatch entrypoints ****************************************************************************/ template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, cudaStream_t stream, int /*ptx_version*/, ScanInitKernelPtrT three_way_partition_init_kernel, SelectIfKernelPtrT three_way_partition_kernel, KernelConfig three_way_partition_config) { cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) { break; } // Number of input tiles int tile_size = three_way_partition_config.block_threads * three_way_partition_config.items_per_thread; int num_tiles = static_cast(DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[2]; // bytes needed for tile status descriptors if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) { break; } allocation_sizes[1] = allocation_sizes[0]; // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void* allocations[2] = {}; if (CubDebug(error = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage // allocation break; } // Return if empty problem if (num_items == 0) { break; } // Construct the tile status interface ScanTileStateT tile_status_1; ScanTileStateT tile_status_2; if (CubDebug(error = tile_status_1.Init(num_tiles, allocations[0], allocation_sizes[0]))) { break; } if (CubDebug(error = tile_status_2.Init(num_tiles, allocations[1], allocation_sizes[1]))) { break; } // Log three_way_partition_init_kernel configuration int init_grid_size = CUB_MAX(1, DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking three_way_partition_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, reinterpret_cast(stream)); #endif // Invoke three_way_partition_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( init_grid_size, INIT_KERNEL_THREADS, 0, stream ).doit(three_way_partition_init_kernel, tile_status_1, tile_status_2, num_tiles, d_num_selected_out); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Get max x-dimension of grid int max_dim_x; if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) { break; } // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; scan_grid_size.y = DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, 
max_dim_x); // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; if (CubDebug(error = MaxSmOccupancy( range_select_sm_occupancy, // out three_way_partition_kernel, three_way_partition_config.block_threads))) { break; } _CubLog("Invoking three_way_partition_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d " "items per thread, %d SM occupancy\n", scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, three_way_partition_config.block_threads, reinterpret_cast(stream), three_way_partition_config.items_per_thread, range_select_sm_occupancy); } #endif // Invoke select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( scan_grid_size, three_way_partition_config.block_threads, 0, stream ).doit(three_way_partition_kernel, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, tile_status_1, tile_status_2, select_first_part_op, select_second_part_op, num_items, num_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } while (0); return error; } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int ptx_version, ScanInitKernelPtrT three_way_partition_init_kernel, SelectIfKernelPtrT three_way_partition_kernel, KernelConfig three_way_partition_config) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, select_first_part_op, select_second_part_op, num_items, stream, ptx_version, three_way_partition_init_kernel, three_way_partition_kernel, three_way_partition_config); } /** * Internal dispatch routine */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, cudaStream_t stream) { cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = cub::PtxVersion(ptx_version))) { break; } // Get kernel kernel dispatch configurations KernelConfig select_if_config; InitConfigs(ptx_version, select_if_config); // Dispatch if (CubDebug(error = Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, select_first_part_op, select_second_part_op, num_items, stream, ptx_version, DeviceThreeWayPartitionInitKernel, DeviceThreeWayPartitionKernel, select_if_config))) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, 
FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, select_first_part_op, select_second_part_op, num_items, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/device/dispatch/dispatch_unique_by_key.cuh000066400000000000000000000543101434614775400233440ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique items by key from sequences of data items residing within device-accessible memory. 
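 *
 * \par Snippet
 * A minimal sketch of the public entry point backed by this dispatch layer,
 * assuming the cub::DeviceSelect::UniqueByKey overload declared in
 * cub/device/device_select.cuh that takes \p num_items. The device pointers
 * and the unique_by_key() wrapper below are illustrative placeholders, not
 * part of the library.
 * \par
 * \code
 * #include <cub/device/device_select.cuh>
 * #include <cuda_runtime.h>
 *
 * // d_keys_in/d_values_in hold num_items items; d_keys_out/d_values_out
 * // have room for num_items items; d_num_selected_out is a single int.
 * void unique_by_key(const int *d_keys_in, const float *d_values_in,
 *                    int *d_keys_out, float *d_values_out,
 *                    int *d_num_selected_out, int num_items)
 * {
 *     // First call: query the required temporary storage size.
 *     void   *d_temp_storage     = nullptr;
 *     size_t  temp_storage_bytes = 0;
 *     cub::DeviceSelect::UniqueByKey(
 *         d_temp_storage, temp_storage_bytes,
 *         d_keys_in, d_values_in, d_keys_out, d_values_out,
 *         d_num_selected_out, num_items);
 *
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 *     // Second call: run the selection.
 *     cub::DeviceSelect::UniqueByKey(
 *         d_temp_storage, temp_storage_bytes,
 *         d_keys_in, d_values_in, d_keys_out, d_values_out,
 *         d_num_selected_out, num_items);
 *
 *     cudaFree(d_temp_storage);
 * }
 * \endcode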
*/ #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * Unique by key kernel entry point (multi-block) */ template < typename AgentUniqueByKeyPolicyT, ///< Parameterized AgentUniqueByKeyPolicy tuning policy type typename KeyInputIteratorT, ///< Random-access input iterator type for keys typename ValueInputIteratorT, ///< Random-access input iterator type for values typename KeyOutputIteratorT, ///< Random-access output iterator type for keys typename ValueOutputIteratorT, ///< Random-access output iterator type for values typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected typename ScanTileStateT, ///< Tile status interface type typename EqualityOpT, ///< Equality operator type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int(AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT::BLOCK_THREADS)) __global__ void DeviceUniqueByKeySweepKernel( KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) ScanTileStateT tile_state, ///< [in] Tile status interface EqualityOpT equality_op, ///< [in] Equality operator OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) int num_tiles) ///< [in] Total number of tiles for the entire problem { // Thread block type for selecting data from input tiles using AgentUniqueByKeyT = AgentUniqueByKey< typename AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT, KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, EqualityOpT, OffsetT>; // Shared memory for AgentUniqueByKey __shared__ typename AgentUniqueByKeyT::TempStorage temp_storage; // Process tiles AgentUniqueByKeyT(temp_storage, d_keys_in, d_values_in, d_keys_out, d_values_out, equality_op, num_items).ConsumeRange( num_tiles, tile_state, d_num_selected_out); } /****************************************************************************** * Policy ******************************************************************************/ template struct DeviceUniqueByKeyPolicy { using KeyT = typename std::iterator_traits::value_type; // SM350 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { const static int INPUT_SIZE = sizeof(KeyT); enum { NOMINAL_4B_ITEMS_PER_THREAD = 9, ITEMS_PER_THREAD = Nominal4BItemsToItems(NOMINAL_4B_ITEMS_PER_THREAD), }; using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS>; }; // SM520 struct Policy520 : ChainedPolicy<520, Policy520, Policy350> { const static int INPUT_SIZE = sizeof(KeyT); enum { NOMINAL_4B_ITEMS_PER_THREAD = 11, ITEMS_PER_THREAD = Nominal4BItemsToItems(NOMINAL_4B_ITEMS_PER_THREAD), }; using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<64, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS>; }; /// MaxPolicy using MaxPolicy = Policy520; }; 
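// DeviceUniqueByKeyPolicy is a ChainedPolicy hub: Policy520 names Policy350
// as its predecessor and Policy350 terminates the chain by naming itself.
// DispatchUniqueByKey below derives from this hub (via SelectedPolicy), and
// at dispatch time the chain is walked against the device's PTX version, so
// PTX versions >= 520 select Policy520 while older versions fall back to
// Policy350.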
/****************************************************************************** * Dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect */ template < typename KeyInputIteratorT, ///< Random-access input iterator type for keys typename ValueInputIteratorT, ///< Random-access input iterator type for values typename KeyOutputIteratorT, ///< Random-access output iterator type for keys typename ValueOutputIteratorT, ///< Random-access output iterator type for values typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected typename EqualityOpT, ///< Equality operator type typename OffsetT, ///< Signed integer type for global offsets typename SelectedPolicy = DeviceUniqueByKeyPolicy> struct DispatchUniqueByKey: SelectedPolicy { /****************************************************************************** * Types and constants ******************************************************************************/ enum { INIT_KERNEL_THREADS = 128, }; // The input key and value type using KeyT = typename std::iterator_traits::value_type; using ValueT = typename std::iterator_traits::value_type; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; void* d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeyInputIteratorT d_keys_in; ///< [in] Pointer to the input sequence of keys ValueInputIteratorT d_values_in; ///< [in] Pointer to the input sequence of values KeyOutputIteratorT d_keys_out; ///< [out] Pointer to the output sequence of selected data items ValueOutputIteratorT d_values_out; ///< [out] Pointer to the output sequence of selected data items NumSelectedIteratorT d_num_selected_out; ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) EqualityOpT equality_op; ///< [in] Equality operator OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) cudaStream_t stream; ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. CUB_RUNTIME_FUNCTION __forceinline__ DispatchUniqueByKey( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) EqualityOpT equality_op, ///< [in] Equality operator OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) cudaStream_t stream ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
): d_temp_storage(d_temp_storage), temp_storage_bytes(temp_storage_bytes), d_keys_in(d_keys_in), d_values_in(d_values_in), d_keys_out(d_keys_out), d_values_out(d_values_out), d_num_selected_out(d_num_selected_out), equality_op(equality_op), num_items(num_items), stream(stream) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ DispatchUniqueByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous ): d_temp_storage(d_temp_storage), temp_storage_bytes(temp_storage_bytes), d_keys_in(d_keys_in), d_values_in(d_values_in), d_keys_out(d_keys_out), d_values_out(d_values_out), d_num_selected_out(d_num_selected_out), equality_op(equality_op), num_items(num_items), stream(stream) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } /****************************************************************************** * Dispatch entrypoints ******************************************************************************/ template CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { using Policy = typename ActivePolicyT::UniqueByKeyPolicyT; using UniqueByKeyAgentT = AgentUniqueByKey; cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; // Number of input tiles int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Size of virtual shared memory int max_shmem = 0; if (CubDebug( error = cudaDeviceGetAttribute(&max_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_ordinal))) { break; } std::size_t vshmem_size = detail::VshmemSize(max_shmem, sizeof(typename UniqueByKeyAgentT::TempStorage), num_tiles); // Specify temporary storage allocation requirements size_t allocation_sizes[2] = {0, vshmem_size}; if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) void *allocations[2] = {NULL, NULL}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the tile status interface ScanTileStateT tile_state; if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; // Log init_kernel configuration num_tiles = CUB_MAX(1, num_tiles); int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( init_grid_size, INIT_KERNEL_THREADS, 0, stream ).doit(init_kernel, tile_state, num_tiles, d_num_selected_out); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Return if empty problem if 
(num_items == 0) break; // Get max x-dimension of grid int max_dim_x; if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG { // Get SM occupancy for unique_by_key_kernel int scan_sm_occupancy; if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out scan_kernel, Policy::BLOCK_THREADS))) { break; } _CubLog("Invoking unique_by_key_kernel<<<{%d,%d,%d}, %d, 0, " "%lld>>>(), %d items per thread, %d SM occupancy\n", scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, Policy::BLOCK_THREADS, (long long)stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); } #endif // Invoke select_if_kernel error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( scan_grid_size, Policy::BLOCK_THREADS, 0, stream ).doit(scan_kernel, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, tile_state, equality_op, num_items, num_tiles); // Check for failure to launch if (CubDebug(error)) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } while(0); return error; } template CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke() { // Ensure kernels are instantiated. return Invoke( DeviceCompactInitKernel, DeviceUniqueByKeySweepKernel< ActivePolicyT, KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, EqualityOpT, OffsetT> ); } /** * Internal dispatch routine */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) EqualityOpT equality_op, ///< [in] Equality operator OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) cudaStream_t stream) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
{ using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy; cudaError_t error; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) break; // Create dispatch functor DispatchUniqueByKey dispatch( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, equality_op, num_items, stream); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void* d_temp_storage, size_t &temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, equality_op, num_items, stream); } }; CUB_NAMESPACE_END cub-2.0.1/cub/grid/000077500000000000000000000000001434614775400140005ustar00rootroot00000000000000cub-2.0.1/cub/grid/grid_barrier.cuh000066400000000000000000000131421434614775400171350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid */ #pragma once #include "../util_debug.cuh" #include "../config.cuh" #include "../thread/thread_load.cuh" CUB_NAMESPACE_BEGIN /** * \addtogroup GridModule * @{ */ /** * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid */ class GridBarrier { protected : typedef unsigned int SyncFlag; // Counters in global device memory SyncFlag* d_sync; public: /** * Constructor */ GridBarrier() : d_sync(NULL) {} /** * Synchronize */ __device__ __forceinline__ void Sync() const { volatile SyncFlag *d_vol_sync = d_sync; // Threadfence and syncthreads to make sure global writes are visible before // thread-0 reports in with its sync counter __threadfence(); CTA_SYNC(); if (blockIdx.x == 0) { // Report in ourselves if (threadIdx.x == 0) { d_vol_sync[blockIdx.x] = 1; } CTA_SYNC(); // Wait for everyone else to report in for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) { while (ThreadLoad(d_sync + peer_block) == 0) { __threadfence_block(); } } CTA_SYNC(); // Let everyone know it's safe to proceed for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) { d_vol_sync[peer_block] = 0; } } else { if (threadIdx.x == 0) { // Report in d_vol_sync[blockIdx.x] = 1; // Wait for acknowledgment while (ThreadLoad(d_sync + blockIdx.x) == 1) { __threadfence_block(); } } CTA_SYNC(); } } }; /** * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. * * Uses RAII for lifetime, i.e., device resources are reclaimed when * the destructor is called. */ class GridBarrierLifetime : public GridBarrier { protected: // Number of bytes backed by d_sync size_t sync_bytes; public: /** * Constructor */ GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} /** * DeviceFrees and resets the progress counters */ cudaError_t HostReset() { cudaError_t retval = cudaSuccess; if (d_sync) { CubDebug(retval = cudaFree(d_sync)); d_sync = NULL; } sync_bytes = 0; return retval; } /** * Destructor */ virtual ~GridBarrierLifetime() { HostReset(); } /** * Sets up the progress counters for the next kernel launch (lazily * allocating and initializing them if necessary) */ cudaError_t Setup(int sweep_grid_size) { cudaError_t retval = cudaSuccess; do { size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); if (new_sync_bytes > sync_bytes) { if (d_sync) { if (CubDebug(retval = cudaFree(d_sync))) break; } sync_bytes = new_sync_bytes; // Allocate and initialize to zero if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; } } while (0); return retval; } }; /** @} */ // end group GridModule CUB_NAMESPACE_END cub-2.0.1/cub/grid/grid_even_share.cuh000066400000000000000000000201371434614775400176300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units (grains). */ #pragma once #include "../config.cuh" #include "../util_namespace.cuh" #include "../util_macro.cuh" #include "../util_math.cuh" #include "../util_type.cuh" #include "grid_mapping.cuh" CUB_NAMESPACE_BEGIN /** * \addtogroup GridModule * @{ */ /** * \brief GridEvenShare is a descriptor utility for distributing input among * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly * the same number of input tiles. * * \par Overview * Each thread block is assigned a consecutive sequence of input tiles. To help * preserve alignment and eliminate the overhead of guarded loads for all but the * last thread block, to GridEvenShare assigns one of three different amounts of * work to a given thread block: "big", "normal", or "last". The "big" workloads * are one scheduling grain larger than "normal". The "last" work unit for the * last thread block may be partially-full if the input is not an even multiple of * the scheduling grain size. * * \par * Before invoking a child grid, a parent thread will typically construct an * instance of GridEvenShare. The instance can be passed to child thread blocks * which can initialize their per-thread block offsets using \p BlockInit(). */ template struct GridEvenShare { private: int total_tiles; int big_shares; OffsetT big_share_items; OffsetT normal_share_items; OffsetT normal_base_offset; public: /// Total number of input items OffsetT num_items; /// Grid size in thread blocks int grid_size; /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles OffsetT block_offset; /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles OffsetT block_end; /// Stride between input tiles OffsetT block_stride; /** * \brief Constructor. */ __host__ __device__ __forceinline__ GridEvenShare() : total_tiles(0), big_shares(0), big_share_items(0), normal_share_items(0), normal_base_offset(0), num_items(0), grid_size(0), block_offset(0), block_end(0), block_stride(0) {} /** * \brief Dispatch initializer. 
To be called prior prior to kernel launch. */ __host__ __device__ __forceinline__ void DispatchInit( OffsetT num_items_, ///< Total number of input items int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) int tile_items) ///< Number of data items per input tile { this->block_offset = num_items_; // Initialize past-the-end this->block_end = num_items_; // Initialize past-the-end this->num_items = num_items_; this->total_tiles = static_cast(cub::DivideAndRoundUp(num_items_, tile_items)); this->grid_size = CUB_MIN(total_tiles, max_grid_size); int avg_tiles_per_block = total_tiles / grid_size; // leftover grains go to big blocks: this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); this->normal_share_items = avg_tiles_per_block * tile_items; this->normal_base_offset = big_shares * tile_items; this->big_share_items = normal_share_items + tile_items; } /** * \brief Initializes ranges for the specified thread block index. Specialized * for a "raking" access pattern in which each thread block is assigned a * consecutive sequence of input tiles. */ template __device__ __forceinline__ void BlockInit( int block_id, Int2Type /*strategy_tag*/) { block_stride = TILE_ITEMS; if (block_id < big_shares) { // This thread block gets a big share of grains (avg_tiles_per_block + 1) block_offset = (block_id * big_share_items); block_end = block_offset + big_share_items; } else if (block_id < total_tiles) { // This thread block gets a normal share of grains (avg_tiles_per_block) block_offset = normal_base_offset + (block_id * normal_share_items); // Avoid generating values greater than num_items, as it may cause overflow block_end = block_offset + CUB_MIN(num_items - block_offset, normal_share_items); } // Else default past-the-end } /** * \brief Block-initialization, specialized for a "raking" access * pattern in which each thread block is assigned a consecutive sequence * of input tiles. */ template __device__ __forceinline__ void BlockInit( int block_id, Int2Type /*strategy_tag*/) { block_stride = grid_size * TILE_ITEMS; block_offset = (block_id * TILE_ITEMS); block_end = num_items; } /** * \brief Block-initialization, specialized for "strip mining" access * pattern in which the input tiles assigned to each thread block are * separated by a stride equal to the the extent of the grid. */ template < int TILE_ITEMS, GridMappingStrategy STRATEGY> __device__ __forceinline__ void BlockInit() { BlockInit(blockIdx.x, Int2Type()); } /** * \brief Block-initialization, specialized for a "raking" access * pattern in which each thread block is assigned a consecutive sequence * of input tiles. */ template __device__ __forceinline__ void BlockInit( OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) OffsetT block_end) ///< [in] Threadblock end offset (exclusive) { this->block_offset = block_offset; this->block_end = block_end; this->block_stride = TILE_ITEMS; } }; /** @} */ // end group GridModule CUB_NAMESPACE_END cub-2.0.1/cub/grid/grid_mapping.cuh000066400000000000000000000111301434614775400171350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. */ #pragma once #include "../config.cuh" CUB_NAMESPACE_BEGIN /** * \addtogroup GridModule * @{ */ /****************************************************************************** * Mapping policies *****************************************************************************/ /** * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. */ enum GridMappingStrategy { /** * \brief An a "raking" access pattern in which each thread block is * assigned a consecutive sequence of input tiles * * \par Overview * The input is evenly partitioned into \p p segments, where \p p is * constant and corresponds loosely to the number of thread blocks that may * actively reside on the target device. Each segment is comprised of * consecutive tiles, where a tile is a small, constant-sized unit of input * to be processed to completion before the thread block terminates or * obtains more work. The kernel invokes \p p thread blocks, each * of which iteratively consumes a segment of n/p elements * in tile-size increments. */ GRID_MAPPING_RAKE, /** * \brief An a "strip mining" access pattern in which the input tiles assigned * to each thread block are separated by a stride equal to the the extent of * the grid. * * \par Overview * The input is evenly partitioned into \p p sets, where \p p is * constant and corresponds loosely to the number of thread blocks that may * actively reside on the target device. Each set is comprised of * data tiles separated by stride \p tiles, where a tile is a small, * constant-sized unit of input to be processed to completion before the * thread block terminates or obtains more work. The kernel invokes \p p * thread blocks, each of which iteratively consumes a segment of * n/p elements in tile-size increments. 
*/ GRID_MAPPING_STRIP_MINE, /** * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. * * \par Overview * The input is treated as a queue to be dynamically consumed by a grid of * thread blocks. Work is atomically dequeued in tiles, where a tile is a * unit of input to be processed to completion before the thread block * terminates or obtains more work. The grid size \p p is constant, * loosely corresponding to the number of thread blocks that may actively * reside on the target device. */ GRID_MAPPING_DYNAMIC, }; /** @} */ // end group GridModule CUB_NAMESPACE_END cub-2.0.1/cub/grid/grid_queue.cuh000066400000000000000000000173161434614775400166420ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::GridQueue is a descriptor utility for dynamic queue management. */ #pragma once #include #include #include CUB_NAMESPACE_BEGIN /** * \addtogroup GridModule * @{ */ /** * \brief GridQueue is a descriptor utility for dynamic queue management. * * \par Overview * GridQueue descriptors provides abstractions for "filling" or * "draining" globally-shared vectors. * * \par * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, * returning a unique offset for the calling thread to write its items. * The GridQueue maintains the total "fill-size". The fill counter must be reset * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that * will be filling. * * \par * Similarly, a "draining" GridQueue works by works by atomically-incrementing a * zero-initialized counter, returning a unique offset for the calling thread to * read its items. Threads can safely drain until the array's logical fill-size is * exceeded. 
The drain counter must be reset using GridQueue::ResetDrain or * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that * will be filling. (For dynamic work distribution of existing data, the corresponding fill-size * is simply the number of elements in the array.) * * \par * Iterative work management can be implemented simply with a pair of flip-flopping * work buffers, each with an associated set of fill and drain GridQueue descriptors. * * \tparam OffsetT Signed integer type for global offsets */ template class GridQueue { private: /// Counter indices enum { FILL = 0, DRAIN = 1, }; /// Pair of counters OffsetT *d_counters; public: /// Returns the device allocation size in bytes needed to construct a GridQueue instance __host__ __device__ __forceinline__ static size_t AllocationSize() { return sizeof(OffsetT) * 2; } /// Constructs an invalid GridQueue descriptor __host__ __device__ __forceinline__ GridQueue() : d_counters(NULL) {} /// Constructs a GridQueue descriptor around the device storage allocation __host__ __device__ __forceinline__ GridQueue( void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). : d_counters((OffsetT*) d_storage) {} /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( OffsetT fill_size, cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET(NV_IS_DEVICE, ( (void)stream; d_counters[FILL] = fill_size; d_counters[DRAIN] = 0; result = cudaSuccess; ), ( OffsetT counters[2]; counters[FILL] = fill_size; counters[DRAIN] = 0; result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); )); return result; } /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET(NV_IS_DEVICE, ( (void)stream; d_counters[DRAIN] = 0; result = cudaSuccess; ), ( result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); )); return result; } /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET(NV_IS_DEVICE, ( (void)stream; d_counters[FILL] = 0; result = cudaSuccess; ), ( result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); )); return result; } /// Returns the fill-size established by the parent or by the previous kernel. __host__ __device__ __forceinline__ cudaError_t FillSize( OffsetT &fill_size, cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET(NV_IS_DEVICE, ( (void)stream; fill_size = d_counters[FILL]; result = cudaSuccess; ), ( result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); )); return result; } /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. 
__device__ __forceinline__ OffsetT Drain(OffsetT num_items) { return atomicAdd(d_counters + DRAIN, num_items); } /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. __device__ __forceinline__ OffsetT Fill(OffsetT num_items) { return atomicAdd(d_counters + FILL, num_items); } }; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Reset grid queue (call with 1 block of 1 thread) */ template __global__ void FillAndResetDrainKernel( GridQueue grid_queue, OffsetT num_items) { grid_queue.FillAndResetDrain(num_items); } #endif // DOXYGEN_SHOULD_SKIP_THIS /** @} */ // end group GridModule CUB_NAMESPACE_END cub-2.0.1/cub/host/000077500000000000000000000000001434614775400140305ustar00rootroot00000000000000cub-2.0.1/cub/host/mutex.cuh000066400000000000000000000103451434614775400156760ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * Simple portable mutex */ #include "../util_cpp_dialect.cuh" #pragma once #if CUB_CPP_DIALECT >= 2011 #include #else #if defined(_WIN32) || defined(_WIN64) #include #define WIN32_LEAN_AND_MEAN #define NOMINMAX #include #undef WIN32_LEAN_AND_MEAN #undef NOMINMAX /** * Compiler read/write barrier */ #pragma intrinsic(_ReadWriteBarrier) #endif #endif #include "../config.cuh" CUB_NAMESPACE_BEGIN /** * Simple portable mutex * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) */ struct Mutex { #if CUB_CPP_DIALECT >= 2011 std::mutex mtx; void Lock() { mtx.lock(); } void Unlock() { mtx.unlock(); } #else // C++11 #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC // Microsoft VC++ typedef long Spinlock; #else // GNU g++ typedef int Spinlock; /** * Compiler read/write barrier */ __forceinline__ void _ReadWriteBarrier() { __sync_synchronize(); } /** * Atomic exchange */ __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) { // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier _ReadWriteBarrier(); return __sync_lock_test_and_set(Target, Value); } /** * Pause instruction to prevent excess processor bus usage */ __forceinline__ void YieldProcessor() { } #endif // MSVC /// Lock member volatile Spinlock lock; /** * Constructor */ Mutex() : lock(0) {} /** * Return when the specified spinlock has been acquired */ __forceinline__ void Lock() { while (1) { if (!_InterlockedExchange(&lock, 1)) return; while (lock) YieldProcessor(); } } /** * Release the specified spinlock */ __forceinline__ void Unlock() { _ReadWriteBarrier(); lock = 0; } #endif // C++11 }; CUB_NAMESPACE_END cub-2.0.1/cub/iterator/000077500000000000000000000000001434614775400147045ustar00rootroot00000000000000cub-2.0.1/cub/iterator/arg_index_input_iterator.cuh000066400000000000000000000207341434614775400225030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #include #include "../config.cuh" #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" #include "../util_device.cuh" #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include #include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIterator * @{ */ /** * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). * * \par Overview * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose * \p key field is \p i and whose \p value field is itr[i]. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device * functions. Wrapped host memory can only be dereferenced on the host, and wrapped * device memory can only be dereferenced on the device. * - Compatible with Thrust API v1.7 or newer. * * \par Snippet * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto * dereference an array of doubles * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize a device array * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] * * // Create an iterator wrapper * cub::ArgIndexInputIterator itr(d_in); * * // Within device code: * typedef typename cub::ArgIndexInputIterator::value_type Tuple; * Tuple item_offset_pair.key = *itr; * printf("%f @ %d\n", * item_offset_pair.value, * item_offset_pair.key); // 8.0 @ 0 * * itr = itr + 6; * item_offset_pair.key = *itr; * printf("%f @ %d\n", * item_offset_pair.value, * item_offset_pair.key); // 9.0 @ 6 * * \endcode * * \tparam InputIteratorT The value type of the wrapped input iterator * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) */ template < typename InputIteratorT, typename OffsetT = ptrdiff_t, typename OutputValueT = cub::detail::value_t> class ArgIndexInputIterator { public: // Required iterator traits typedef ArgIndexInputIterator self_type; ///< My own type typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef KeyValuePair value_type; ///< The type of the element the iterator can point to typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to typedef value_type reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< 
THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category #else typedef std::random_access_iterator_tag iterator_category; ///< The iterator category #endif // THRUST_VERSION private: InputIteratorT itr; difference_type offset; public: /// Constructor __host__ __device__ __forceinline__ ArgIndexInputIterator( InputIteratorT itr, ///< Input iterator to wrap difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator : itr(itr), offset(offset) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { self_type retval = *this; offset++; return retval; } /// Prefix increment __host__ __device__ __forceinline__ self_type operator++() { offset++; return *this; } /// Indirection __host__ __device__ __forceinline__ reference operator*() const { value_type retval; retval.value = itr[offset]; retval.key = offset; return retval; } /// Addition template __host__ __device__ __forceinline__ self_type operator+(Distance n) const { self_type retval(itr, offset + n); return retval; } /// Addition assignment template __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { offset += n; return *this; } /// Subtraction template __host__ __device__ __forceinline__ self_type operator-(Distance n) const { self_type retval(itr, offset - n); return retval; } /// Subtraction assignment template __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { offset -= n; return *this; } /// Distance __host__ __device__ __forceinline__ difference_type operator-(self_type other) const { return offset - other.offset; } /// Array subscript template __host__ __device__ __forceinline__ reference operator[](Distance n) const { self_type offset = (*this) + n; return *offset; } /// Structure dereference __host__ __device__ __forceinline__ pointer operator->() { return &(*(*this)); } /// Equal to __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { return ((itr == rhs.itr) && (offset == rhs.offset)); } /// Not equal to __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { return ((itr != rhs.itr) || (offset != rhs.offset)); } /// Normalize __host__ __device__ __forceinline__ void normalize() { itr += offset; offset = 0; } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) { return os; } }; /** @} */ // end group UtilIterator CUB_NAMESPACE_END cub-2.0.1/cub/iterator/cache_modified_input_iterator.cuh000066400000000000000000000175101434614775400234440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #include #include "../config.cuh" #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" #include "../util_device.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include #include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIterator * @{ */ /** * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. * * \par Overview * - CacheModifiedInputIterator is a random-access input iterator that wraps a native * device pointer of type ValueType*. \p ValueType references are * made by reading \p ValueType values through loads modified by \p MODIFIER. * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). * - Can be constructed, manipulated, and exchanged within and between host and device * functions, but can only be dereferenced within device functions. * - Compatible with Thrust API v1.7 or newer. * * \par Snippet * The code snippet below illustrates the use of \p CacheModifiedInputIterator to * dereference a device array of double using the "ldg" PTX load modifier * (i.e., load values through texture cache). 
* \par * \code * #include // or equivalently * * // Declare, allocate, and initialize a device array * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] * * // Create an iterator wrapper * cub::CacheModifiedInputIterator itr(d_in); * * // Within device code: * printf("%f\n", itr[0]); // 8.0 * printf("%f\n", itr[1]); // 6.0 * printf("%f\n", itr[6]); // 9.0 * * \endcode * * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data * \tparam ValueType The value type of this iterator * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < CacheLoadModifier MODIFIER, typename ValueType, typename OffsetT = ptrdiff_t> class CacheModifiedInputIterator { public: // Required iterator traits typedef CacheModifiedInputIterator self_type; ///< My own type typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef ValueType value_type; ///< The type of the element the iterator can point to typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to typedef ValueType reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category #else typedef std::random_access_iterator_tag iterator_category; ///< The iterator category #endif // THRUST_VERSION public: /// Wrapped native pointer ValueType* ptr; /// Constructor template __host__ __device__ __forceinline__ CacheModifiedInputIterator( QualifiedValueType* ptr) ///< Native pointer to wrap : ptr(const_cast::type *>(ptr)) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { self_type retval = *this; ptr++; return retval; } /// Prefix increment __host__ __device__ __forceinline__ self_type operator++() { ptr++; return *this; } /// Indirection __device__ __forceinline__ reference operator*() const { return ThreadLoad(ptr); } /// Addition template __host__ __device__ __forceinline__ self_type operator+(Distance n) const { self_type retval(ptr + n); return retval; } /// Addition assignment template __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { ptr += n; return *this; } /// Subtraction template __host__ __device__ __forceinline__ self_type operator-(Distance n) const { self_type retval(ptr - n); return retval; } /// Subtraction assignment template __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { ptr -= n; return *this; } /// Distance __host__ __device__ __forceinline__ difference_type operator-(self_type other) const { return ptr - other.ptr; } /// Array subscript template __device__ __forceinline__ reference operator[](Distance n) const { return ThreadLoad(ptr + n); } /// Structure dereference __device__ __forceinline__ pointer operator->() { return &ThreadLoad(ptr); } /// Equal to __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { return (ptr == rhs.ptr); } /// Not equal to __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { return (ptr != rhs.ptr); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) { return os; } }; /** @} */ // end group UtilIterator 
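// A short usage sketch (d_in, d_sum, and num_items are assumed caller-provided names):
// wrapping a raw device pointer in CacheModifiedInputIterator lets a device-wide
// algorithm such as cub::DeviceReduce::Sum stream its input through the LOAD_LDG
// (read-only data cache) path instead of default loads, with no other code changes.
//
//   #include <cub/device/device_reduce.cuh>
//   #include <cub/iterator/cache_modified_input_iterator.cuh>
//
//   float *d_in;   // num_items input values (device)
//   float *d_sum;  // single output value (device)
//
//   cub::CacheModifiedInputIterator<cub::LOAD_LDG, float> ldg_in(d_in);
//
//   void   *d_temp_storage     = nullptr;
//   size_t  temp_storage_bytes = 0;
//   cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, ldg_in, d_sum, num_items);
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, ldg_in, d_sum, num_items);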
CUB_NAMESPACE_END cub-2.0.1/cub/iterator/cache_modified_output_iterator.cuh000066400000000000000000000200441434614775400236410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #include #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" #include "../config.cuh" #include "../util_device.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include #include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIterator * @{ */ /** * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. * * \par Overview * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native * device pointer of type ValueType*. \p ValueType references are * made by writing \p ValueType values through stores modified by \p MODIFIER. * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", * "STORE_CG", "STORE_CS", "STORE_WT", etc.). * - Can be constructed, manipulated, and exchanged within and between host and device * functions, but can only be dereferenced within device functions. * - Compatible with Thrust API v1.7 or newer. * * \par Snippet * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to * dereference a device array of doubles using the "wt" PTX load modifier * (i.e., write-through to system memory). 
* \par * \code * #include // or equivalently * * // Declare, allocate, and initialize a device array * double *d_out; // e.g., [, , , , , , ] * * // Create an iterator wrapper * cub::CacheModifiedOutputIterator itr(d_out); * * // Within device code: * itr[0] = 8.0; * itr[1] = 66.0; * itr[55] = 24.0; * * \endcode * * \par Usage Considerations * - Can only be dereferenced within device code * * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data * \tparam ValueType The value type of this iterator * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < CacheStoreModifier MODIFIER, typename ValueType, typename OffsetT = ptrdiff_t> class CacheModifiedOutputIterator { private: // Proxy object struct Reference { ValueType* ptr; /// Constructor __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} /// Assignment __device__ __forceinline__ ValueType operator =(ValueType val) { ThreadStore(ptr, val); return val; } }; public: // Required iterator traits typedef CacheModifiedOutputIterator self_type; ///< My own type typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef void value_type; ///< The type of the element the iterator can point to typedef void pointer; ///< The type of a pointer to an element the iterator can point to typedef Reference reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category #else typedef std::random_access_iterator_tag iterator_category; ///< The iterator category #endif // THRUST_VERSION private: ValueType* ptr; public: /// Constructor template __host__ __device__ __forceinline__ CacheModifiedOutputIterator( QualifiedValueType* ptr) ///< Native pointer to wrap : ptr(const_cast::type *>(ptr)) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { self_type retval = *this; ptr++; return retval; } /// Prefix increment __host__ __device__ __forceinline__ self_type operator++() { ptr++; return *this; } /// Indirection __host__ __device__ __forceinline__ reference operator*() const { return Reference(ptr); } /// Addition template __host__ __device__ __forceinline__ self_type operator+(Distance n) const { self_type retval(ptr + n); return retval; } /// Addition assignment template __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { ptr += n; return *this; } /// Subtraction template __host__ __device__ __forceinline__ self_type operator-(Distance n) const { self_type retval(ptr - n); return retval; } /// Subtraction assignment template __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { ptr -= n; return *this; } /// Distance __host__ __device__ __forceinline__ difference_type operator-(self_type other) const { return ptr - other.ptr; } /// Array subscript template __host__ __device__ __forceinline__ reference operator[](Distance n) const { return Reference(ptr + n); } /// Equal to __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { return (ptr == rhs.ptr); } /// Not equal to __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { 
return (ptr != rhs.ptr); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { return os; } }; /** @} */ // end group UtilIterator CUB_NAMESPACE_END cub-2.0.1/cub/iterator/constant_input_iterator.cuh000066400000000000000000000166061434614775400223770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #include #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" #include "../config.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include #include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIterator * @{ */ /** * \brief A random-access input generator for dereferencing a sequence of homogeneous values * * \par Overview * - Read references to a ConstantInputIteratorTiterator always return the supplied constant * of type \p ValueType. * - Can be used with any data type. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device * functions. * - Compatible with Thrust API v1.7 or newer. * * \par Snippet * The code snippet below illustrates the use of \p ConstantInputIteratorTto * dereference a sequence of homogeneous doubles. 
* \par * \code * #include // or equivalently * * cub::ConstantInputIterator itr(5.0); * * printf("%f\n", itr[0]); // 5.0 * printf("%f\n", itr[1]); // 5.0 * printf("%f\n", itr[2]); // 5.0 * printf("%f\n", itr[50]); // 5.0 * * \endcode * * \tparam ValueType The value type of this iterator * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < typename ValueType, typename OffsetT = ptrdiff_t> class ConstantInputIterator { public: // Required iterator traits typedef ConstantInputIterator self_type; ///< My own type typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef ValueType value_type; ///< The type of the element the iterator can point to typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to typedef ValueType reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category #else typedef std::random_access_iterator_tag iterator_category; ///< The iterator category #endif // THRUST_VERSION private: ValueType val; OffsetT offset; #ifdef _WIN32 OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) #endif public: /// Constructor __host__ __device__ __forceinline__ ConstantInputIterator( ValueType val, ///< Starting value for the iterator instance to report OffsetT offset = 0) ///< Base offset : val(val), offset(offset) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { self_type retval = *this; offset++; return retval; } /// Prefix increment __host__ __device__ __forceinline__ self_type operator++() { offset++; return *this; } /// Indirection __host__ __device__ __forceinline__ reference operator*() const { return val; } /// Addition template __host__ __device__ __forceinline__ self_type operator+(Distance n) const { self_type retval(val, offset + n); return retval; } /// Addition assignment template __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { offset += n; return *this; } /// Subtraction template __host__ __device__ __forceinline__ self_type operator-(Distance n) const { self_type retval(val, offset - n); return retval; } /// Subtraction assignment template __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { offset -= n; return *this; } /// Distance __host__ __device__ __forceinline__ difference_type operator-(self_type other) const { return offset - other.offset; } /// Array subscript template __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const { return val; } /// Structure dereference __host__ __device__ __forceinline__ pointer operator->() { return &val; } /// Equal to __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { return (offset == rhs.offset) && ((val == rhs.val)); } /// Not equal to __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { return (offset != rhs.offset) || (val!= rhs.val); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "[" << itr.val << "," << itr.offset << "]"; return os; } 
}; /** @} */ // end group UtilIterator CUB_NAMESPACE_END cub-2.0.1/cub/iterator/counting_input_iterator.cuh000066400000000000000000000161551434614775400223730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #include #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" #include "../config.cuh" #include "../util_device.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include #include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIterator * @{ */ /** * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. * * \par Overview * - After initializing a CountingInputIteratorTto a certain integer \p base, read references * at \p offset will return the value \p base + \p offset. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device * functions. * - Compatible with Thrust API v1.7 or newer. * * \par Snippet * The code snippet below illustrates the use of \p CountingInputIteratorTto * dereference a sequence of incrementing integers. 
* \par * \code * #include // or equivalently * * cub::CountingInputIterator itr(5); * * printf("%d\n", itr[0]); // 5 * printf("%d\n", itr[1]); // 6 * printf("%d\n", itr[2]); // 7 * printf("%d\n", itr[50]); // 55 * * \endcode * * \tparam ValueType The value type of this iterator * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < typename ValueType, typename OffsetT = ptrdiff_t> class CountingInputIterator { public: // Required iterator traits typedef CountingInputIterator self_type; ///< My own type typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef ValueType value_type; ///< The type of the element the iterator can point to typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to typedef ValueType reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category #else typedef std::random_access_iterator_tag iterator_category; ///< The iterator category #endif // THRUST_VERSION private: ValueType val; public: /// Constructor __host__ __device__ __forceinline__ CountingInputIterator( const ValueType &val) ///< Starting value for the iterator instance to report : val(val) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { self_type retval = *this; val++; return retval; } /// Prefix increment __host__ __device__ __forceinline__ self_type operator++() { val++; return *this; } /// Indirection __host__ __device__ __forceinline__ reference operator*() const { return val; } /// Addition template __host__ __device__ __forceinline__ self_type operator+(Distance n) const { self_type retval(val + (ValueType) n); return retval; } /// Addition assignment template __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { val += (ValueType) n; return *this; } /// Subtraction template __host__ __device__ __forceinline__ self_type operator-(Distance n) const { self_type retval(val - (ValueType) n); return retval; } /// Subtraction assignment template __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { val -= n; return *this; } /// Distance __host__ __device__ __forceinline__ difference_type operator-(self_type other) const { return (difference_type) (val - other.val); } /// Array subscript template __host__ __device__ __forceinline__ reference operator[](Distance n) const { return val + (ValueType) n; } /// Structure dereference __host__ __device__ __forceinline__ pointer operator->() { return &val; } /// Equal to __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { return (val == rhs.val); } /// Not equal to __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { return (val != rhs.val); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "[" << itr.val << "]"; return os; } }; /** @} */ // end group UtilIterator CUB_NAMESPACE_END 
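/*
 * A minimal device-side sketch of how the two generator iterators defined
 * above can be combined, assuming <cub/cub.cuh> (or the individual iterator
 * headers) is on the include path. The kernel name `ScaleByIndex` and its
 * launch configuration are illustrative only, not part of the CUB sources.
 */
#include <cub/cub.cuh>

__global__ void ScaleByIndex(double *d_out, int n)
{
    // Every read of `scale` yields the constant 2.0; `index` yields its
    // base value (0) plus the requested offset.
    cub::ConstantInputIterator<double> scale(2.0);
    cub::CountingInputIterator<int>    index(0);

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        d_out[i] = scale[i] * index[i];   // d_out[i] == 2.0 * i
    }
}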
cub-2.0.1/cub/iterator/discard_output_iterator.cuh000066400000000000000000000146171434614775400223600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #include #include "../config.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include #include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIterator * @{ */ /** * \brief A discard iterator */ template class DiscardOutputIterator { public: // Required iterator traits typedef DiscardOutputIterator self_type; ///< My own type typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef void value_type; ///< The type of the element the iterator can point to typedef void pointer; ///< The type of a pointer to an element the iterator can point to typedef void reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category #else typedef std::random_access_iterator_tag iterator_category; ///< The iterator category #endif // THRUST_VERSION private: OffsetT offset; #if defined(_WIN32) || !defined(_WIN64) // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))] = {}; #endif public: /// Constructor __host__ __device__ __forceinline__ DiscardOutputIterator( OffsetT offset = 0) ///< Base offset : offset(offset) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { self_type retval = *this; offset++; return retval; } /// Prefix increment __host__ __device__ __forceinline__ self_type operator++() { offset++; return *this; } /// Indirection __host__ __device__ __forceinline__ self_type& operator*() { // return self reference, which can be assigned to anything return *this; } /// Addition template __host__ __device__ __forceinline__ self_type operator+(Distance n) const { self_type retval(offset + n); return retval; } /// Addition assignment template __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { offset += n; return *this; } /// Subtraction template __host__ __device__ __forceinline__ self_type operator-(Distance n) const { self_type retval(offset - n); return retval; } /// Subtraction assignment template __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { offset -= n; return *this; } /// Distance __host__ __device__ __forceinline__ difference_type operator-(self_type other) const { return offset - other.offset; } /// Array subscript template __host__ __device__ __forceinline__ self_type& operator[](Distance n) { // return self reference, which can be assigned to anything return *this; } /// Structure dereference __host__ __device__ __forceinline__ pointer operator->() { return; } /// Assignment to anything else (no-op) template __host__ __device__ __forceinline__ void operator=(T const&) {} /// Cast to void* operator __host__ __device__ __forceinline__ operator void*() const { return NULL; } /// Equal to __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { return (offset == rhs.offset); } /// Not equal to __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { return 
(offset != rhs.offset); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "[" << itr.offset << "]"; return os; } }; /** @} */ // end group UtilIterator CUB_NAMESPACE_END cub-2.0.1/cub/iterator/tex_obj_input_iterator.cuh000066400000000000000000000252131434614775400221720ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #include #include #include #include #include #include #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include #include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIterator * @{ */ /** * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. * * \par Overview * - TexObjInputIterator wraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. * - Can be manipulated and exchanged within and between host and device * functions, can only be constructed within host functions, and can only be * dereferenced within device functions. * - With regard to nested/dynamic parallelism, TexObjInputIterator iterators may only be * created by the host thread, but can be used by any descendant kernel. * - Compatible with Thrust API v1.7 or newer. * * \par Snippet * The code snippet below illustrates the use of \p TexObjInputIterator to * dereference a device array of doubles through texture cache. 
* \par * \code * #include // or equivalently * * // Declare, allocate, and initialize a device array * int num_items; // e.g., 7 * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] * * // Create an iterator wrapper * cub::TexObjInputIterator itr; * itr.BindTexture(d_in, sizeof(double) * num_items); * ... * * // Within device code: * printf("%f\n", itr[0]); // 8.0 * printf("%f\n", itr[1]); // 6.0 * printf("%f\n", itr[6]); // 9.0 * * ... * itr.UnbindTexture(); * * \endcode * * \tparam T The value type of this iterator * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < typename T, typename OffsetT = ptrdiff_t> class TexObjInputIterator { public: // Required iterator traits typedef TexObjInputIterator self_type; ///< My own type typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef T value_type; ///< The type of the element the iterator can point to typedef T* pointer; ///< The type of a pointer to an element the iterator can point to typedef T reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category #else typedef std::random_access_iterator_tag iterator_category; ///< The iterator category #endif // THRUST_VERSION private: // Largest texture word we can use in device typedef typename UnitWord::TextureWord TextureWord; // Number of texture words per T enum { TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) }; private: T* ptr; difference_type tex_offset; cudaTextureObject_t tex_obj; public: /// Constructor __host__ __device__ __forceinline__ TexObjInputIterator() : ptr(NULL), tex_offset(0), tex_obj(0) {} /// Use this iterator to bind \p ptr with a texture reference template cudaError_t BindTexture( QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment size_t bytes, ///< Number of bytes in the range size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator { this->ptr = const_cast::type *>(ptr); this->tex_offset = static_cast(tex_offset); cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); cudaResourceDesc res_desc; cudaTextureDesc tex_desc; memset(&res_desc, 0, sizeof(cudaResourceDesc)); memset(&tex_desc, 0, sizeof(cudaTextureDesc)); res_desc.resType = cudaResourceTypeLinear; res_desc.res.linear.devPtr = this->ptr; res_desc.res.linear.desc = channel_desc; res_desc.res.linear.sizeInBytes = bytes; tex_desc.readMode = cudaReadModeElementType; return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL)); } /// Unbind this iterator from its texture reference cudaError_t UnbindTexture() { return CubDebug(cudaDestroyTextureObject(tex_obj)); } /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { self_type retval = *this; tex_offset++; return retval; } /// Prefix increment __host__ __device__ __forceinline__ self_type operator++() { tex_offset++; return *this; } /// Indirection __host__ __device__ __forceinline__ reference operator*() const { NV_IF_TARGET(NV_IS_HOST, (return ptr[tex_offset];), (return this->device_deref();)); } /// Addition template __host__ __device__ 
__forceinline__ self_type operator+(Distance n) const { self_type retval; retval.ptr = ptr; retval.tex_obj = tex_obj; retval.tex_offset = tex_offset + n; return retval; } /// Addition assignment template __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { tex_offset += n; return *this; } /// Subtraction template __host__ __device__ __forceinline__ self_type operator-(Distance n) const { self_type retval; retval.ptr = ptr; retval.tex_obj = tex_obj; retval.tex_offset = tex_offset - n; return retval; } /// Subtraction assignment template __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { tex_offset -= n; return *this; } /// Distance __host__ __device__ __forceinline__ difference_type operator-(self_type other) const { return tex_offset - other.tex_offset; } /// Array subscript template __host__ __device__ __forceinline__ reference operator[](Distance n) const { self_type offset = (*this) + n; return *offset; } /// Structure dereference __host__ __device__ __forceinline__ pointer operator->() { return &(*(*this)); } /// Equal to __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); } /// Not equal to __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "cub::TexObjInputIterator( ptr=" << itr.ptr << ", offset=" << itr.tex_offset << ", tex_obj=" << itr.tex_obj << " )"; return os; } private: // This is hoisted out of operator* because #pragma can't be used inside of // NV_IF_TARGET __device__ __forceinline__ reference device_deref() const { // Move array of uninitialized words, then alias and assign to return // value TextureWord words[TEXTURE_MULTIPLE]; const auto tex_idx_base = tex_offset * TEXTURE_MULTIPLE; #pragma unroll for (int i = 0; i < TEXTURE_MULTIPLE; ++i) { words[i] = tex1Dfetch(tex_obj, tex_idx_base + i); } // Load from words return *reinterpret_cast(words); } }; /** @} */ // end group UtilIterator CUB_NAMESPACE_END cub-2.0.1/cub/iterator/tex_ref_input_iterator.cuh000066400000000000000000000107451434614775400222000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #include #include CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIterator * @{ */ /** * \brief A random-access input wrapper for dereferencing array values through texture cache. * * \deprecated [Since 1.13.0] The CUDA texture management APIs used by * TexRefInputIterator are deprecated. Use cub::TexObjInputIterator instead. * * \par Overview * - TexRefInputIterator wraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. * - Can be manipulated and exchanged within and between host and device * functions, can only be constructed within host functions, and can only be * dereferenced within device functions. * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture * reference. Only one TexRefInputIterator instance can be bound at any given time for a * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host * thread, and (4) compilation .o unit. * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be * created by the host thread and used by a top-level kernel (i.e. the one which is launched * from the host). * - Compatible with Thrust API v1.7 or newer. * * \par Snippet * The code snippet below illustrates the use of \p TexRefInputIterator to * dereference a device array of doubles through texture cache. * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize a device array * int num_items; // e.g., 7 * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] * * // Create an iterator wrapper * cub::TexRefInputIterator itr; * itr.BindTexture(d_in, sizeof(double) * num_items); * ... * * // Within device code: * printf("%f\n", itr[0]); // 8.0 * printf("%f\n", itr[1]); // 6.0 * printf("%f\n", itr[6]); // 9.0 * * ... * itr.UnbindTexture(); * * \endcode * * \tparam T The value type of this iterator * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < typename T, int /*UNIQUE_ID*/, typename OffsetT = std::ptrdiff_t> using TexRefInputIterator CUB_DEPRECATED = cub::TexObjInputIterator; /** @} */ // end group UtilIterator CUB_NAMESPACE_END cub-2.0.1/cub/iterator/transform_input_iterator.cuh000066400000000000000000000202721434614775400225530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #include #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" #include "../config.cuh" #include "../util_device.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include #include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIterator * @{ */ /** * \brief A random-access input wrapper for transforming dereferenced values. * * \par Overview * - TransformInputIteratorTwraps a unary conversion functor of type \p * ConversionOp and a random-access input iterator of type InputIteratorT, * using the former to produce references of type \p ValueType from the latter. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device * functions. Wrapped host memory can only be dereferenced on the host, and wrapped * device memory can only be dereferenced on the device. * - Compatible with Thrust API v1.7 or newer. * * \par Snippet * The code snippet below illustrates the use of \p TransformInputIteratorTto * dereference an array of integers, tripling the values and converting them to doubles. * \par * \code * #include // or equivalently * * // Functor for tripling integer values and converting to doubles * struct TripleDoubler * { * __host__ __device__ __forceinline__ * double operator()(const int &a) const { * return double(a * 3); * } * }; * * // Declare, allocate, and initialize a device array * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * TripleDoubler conversion_op; * * // Create an iterator wrapper * cub::TransformInputIterator itr(d_in, conversion_op); * * // Within device code: * printf("%f\n", itr[0]); // 24.0 * printf("%f\n", itr[1]); // 18.0 * printf("%f\n", itr[6]); // 27.0 * * \endcode * * \tparam ValueType The value type of this iterator * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. 
Must have member ValueType operator()(const InputType &datum). * \tparam InputIteratorT The type of the wrapped input iterator * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) * */ template < typename ValueType, typename ConversionOp, typename InputIteratorT, typename OffsetT = ptrdiff_t> class TransformInputIterator { public: // Required iterator traits typedef TransformInputIterator self_type; ///< My own type typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef ValueType value_type; ///< The type of the element the iterator can point to typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to typedef ValueType reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category #else typedef std::random_access_iterator_tag iterator_category; ///< The iterator category #endif // THRUST_VERSION private: ConversionOp conversion_op; InputIteratorT input_itr; public: /// Constructor __host__ __device__ __forceinline__ TransformInputIterator( InputIteratorT input_itr, ///< Input iterator to wrap ConversionOp conversion_op) ///< Conversion functor to wrap : conversion_op(conversion_op), input_itr(input_itr) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { self_type retval = *this; input_itr++; return retval; } /// Prefix increment __host__ __device__ __forceinline__ self_type operator++() { input_itr++; return *this; } /// Indirection __host__ __device__ __forceinline__ reference operator*() const { return conversion_op(*input_itr); } /// Addition template __host__ __device__ __forceinline__ self_type operator+(Distance n) const { self_type retval(input_itr + n, conversion_op); return retval; } /// Addition assignment template __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { input_itr += n; return *this; } /// Subtraction template __host__ __device__ __forceinline__ self_type operator-(Distance n) const { self_type retval(input_itr - n, conversion_op); return retval; } /// Subtraction assignment template __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { input_itr -= n; return *this; } /// Distance __host__ __device__ __forceinline__ difference_type operator-(self_type other) const { return input_itr - other.input_itr; } /// Array subscript template __host__ __device__ __forceinline__ reference operator[](Distance n) const { return conversion_op(input_itr[n]); } /// Equal to __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { return (input_itr == rhs.input_itr); } /// Not equal to __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { return (input_itr != rhs.input_itr); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& /* itr */) { return os; } }; /** @} */ // end group UtilIterator CUB_NAMESPACE_END 
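/*
 * A minimal host-side sketch of the fusion pattern this wrapper enables,
 * assuming <cub/cub.cuh>, a valid CUDA context, and caller-managed device
 * allocations; the functor `Square` and function `SumOfSquares` are
 * illustrative names, and error checking is omitted for brevity. Wrapping the
 * input with TransformInputIterator lets cub::DeviceReduce::Sum accumulate
 * the squares of the inputs without a separate transform pass or
 * intermediate array.
 */
#include <cub/cub.cuh>

struct Square
{
    __host__ __device__ __forceinline__
    double operator()(const double &x) const { return x * x; }
};

void SumOfSquares(const double *d_in, double *d_out, int num_items)
{
    // Wrap the raw device pointer; each dereference yields x * x
    cub::TransformInputIterator<double, Square, const double*> itr(d_in, Square());

    // First call sizes the temporary storage, second call runs the reduction
    void   *d_temp_storage    = nullptr;
    size_t  temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, itr, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, itr, d_out, num_items);
    cudaFree(d_temp_storage);
}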
cub-2.0.1/cub/thread/000077500000000000000000000000001434614775400143225ustar00rootroot00000000000000cub-2.0.1/cub/thread/thread_load.cuh000066400000000000000000000432051434614775400172750ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Thread utilities for reading memory using PTX cache modifiers. */ #pragma once #include #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIo * @{ */ //----------------------------------------------------------------------------- // Tags and constants //----------------------------------------------------------------------------- /** * \brief Enumeration of cache modifiers for memory load operations. */ enum CacheLoadModifier { LOAD_DEFAULT, ///< Default (no modifier) LOAD_CA, ///< Cache at all levels LOAD_CG, ///< Cache at global level LOAD_CS, ///< Cache streaming (likely to be accessed once) LOAD_CV, ///< Cache as volatile (including cached system lines) LOAD_LDG, ///< Cache as texture LOAD_VOLATILE, ///< Volatile (any memory space) }; /** * \name Thread I/O (cache modified) * @{ */ /** * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
* * \par Example * \code * #include // or equivalently * * // 32-bit load using cache-global modifier: * int *d_in; * int val = cub::ThreadLoad(d_in + threadIdx.x); * * // 16-bit load using default modifier * short *d_in; * short val = cub::ThreadLoad(d_in + threadIdx.x); * * // 256-bit load using cache-volatile modifier * double4 *d_in; * double4 val = cub::ThreadLoad(d_in + threadIdx.x); * * // 96-bit load using cache-streaming modifier * struct TestFoo { bool a; short b; }; * TestFoo *d_struct; * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); * \endcode * * \tparam MODIFIER [inferred] CacheLoadModifier enumeration * \tparam InputIteratorT [inferred] Input iterator type \iterator */ template __device__ __forceinline__ cub::detail::value_t ThreadLoad(InputIteratorT itr); //@} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Helper structure for templated load iteration (inductive case) template struct IterateThreadLoad { template static __device__ __forceinline__ void Load(T const *ptr, T *vals) { vals[COUNT] = ThreadLoad(ptr + COUNT); IterateThreadLoad::template Load(ptr, vals); } template static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) { vals[COUNT] = itr[COUNT]; IterateThreadLoad::Dereference(itr, vals); } }; /// Helper structure for templated load iteration (termination case) template struct IterateThreadLoad { template static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} template static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} }; /** * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ uint4 ThreadLoad(uint4 const *ptr) \ { \ uint4 retval; \ asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ "=r"(retval.x), \ "=r"(retval.y), \ "=r"(retval.z), \ "=r"(retval.w) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } \ template<> \ __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ { \ ulonglong2 retval; \ asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ "=l"(retval.x), \ "=l"(retval.y) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ { \ ushort4 retval; \ asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ "=h"(retval.x), \ "=h"(retval.y), \ "=h"(retval.z), \ "=h"(retval.w) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } \ template<> \ __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ { \ uint2 retval; \ asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ "=r"(retval.x), \ "=r"(retval.y) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } \ template<> \ __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ { \ unsigned long long retval; \ asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ "=l"(retval) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ { \ unsigned int retval; \ asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ "=r"(retval) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } 
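// The 2B and 1B variants below complete the set of word sizes; _CUB_LOAD_ALL
// then expands all five for each cache modifier (LOAD_CA, LOAD_CG, LOAD_CS,
// LOAD_CV, LOAD_LDG), producing the ThreadLoad specializations that emit the
// corresponding ld.<modifier> PTX instruction. For example (a sketch, with
// `d_in` a hypothetical device pointer to unsigned int), a call such as
//
//     unsigned int v = cub::ThreadLoad<cub::LOAD_CG>(d_in + threadIdx.x);
//
// resolves through the 4-byte specialization generated above and issues an
// ld.cg.u32 load.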
/** * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ { \ unsigned short retval; \ asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ "=h"(retval) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ { \ unsigned short retval; \ asm volatile ( \ "{" \ " .reg .u8 datum;" \ " ld."#ptx_modifier".u8 datum, [%1];" \ " cvt.u16.u8 %0, datum;" \ "}" : \ "=h"(retval) : \ _CUB_ASM_PTR_(ptr)); \ return (unsigned char) retval; \ } /** * Define powers-of-two ThreadLoad specializations for the given Cache load modifier */ #define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ _CUB_LOAD_16(cub_modifier, ptx_modifier) \ _CUB_LOAD_8(cub_modifier, ptx_modifier) \ _CUB_LOAD_4(cub_modifier, ptx_modifier) \ _CUB_LOAD_2(cub_modifier, ptx_modifier) \ _CUB_LOAD_1(cub_modifier, ptx_modifier) \ /** * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers */ _CUB_LOAD_ALL(LOAD_CA, ca) _CUB_LOAD_ALL(LOAD_CG, cg) _CUB_LOAD_ALL(LOAD_CS, cs) _CUB_LOAD_ALL(LOAD_CV, cv) _CUB_LOAD_ALL(LOAD_LDG, global.nc) // Macro cleanup #undef _CUB_LOAD_ALL #undef _CUB_LOAD_1 #undef _CUB_LOAD_2 #undef _CUB_LOAD_4 #undef _CUB_LOAD_8 #undef _CUB_LOAD_16 /** * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types */ template __device__ __forceinline__ cub::detail::value_t ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *itr; } /** * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types */ template __device__ __forceinline__ T ThreadLoad( T *ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *ptr; } /** * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types */ template __device__ __forceinline__ T ThreadLoadVolatilePointer( T *ptr, Int2Type /*is_primitive*/) { T retval = *reinterpret_cast(ptr); return retval; } /** * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types */ template __device__ __forceinline__ T ThreadLoadVolatilePointer( T *ptr, Int2Type /*is_primitive*/) { typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); T retval; VolatileWord *words = reinterpret_cast(&retval); IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( reinterpret_cast(ptr), words); return retval; } /** * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types */ template __device__ __forceinline__ T ThreadLoad( T *ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { // Apply tags for partial-specialization return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); } /** * ThreadLoad definition for generic modifiers on pointer types */ template __device__ __forceinline__ T ThreadLoad( T const *ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { typedef typename UnitWord::DeviceWord DeviceWord; const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); DeviceWord words[DEVICE_MULTIPLE]; IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( reinterpret_cast(const_cast(ptr)), words); return *reinterpret_cast(words); } /** * ThreadLoad definition for generic modifiers */ template < 
CacheLoadModifier MODIFIER, typename InputIteratorT> __device__ __forceinline__ cub::detail::value_t ThreadLoad(InputIteratorT itr) { // Apply tags for partial-specialization return ThreadLoad( itr, Int2Type(), Int2Type::value>()); } #endif // DOXYGEN_SHOULD_SKIP_THIS /** @} */ // end group UtilIo CUB_NAMESPACE_END cub-2.0.1/cub/thread/thread_operators.cuh000066400000000000000000000255761434614775400204070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file * Simple binary operator functor types */ /****************************************************************************** * Simple functor operators ******************************************************************************/ #pragma once #include #include #include #include CUB_NAMESPACE_BEGIN /** * @addtogroup UtilModule * @{ */ /// @brief Default equality functor struct Equality { /// Boolean equality operator, returns `t == u` template __host__ __device__ __forceinline__ bool operator()(T &&t, U &&u) const { return ::cuda::std::forward(t) == ::cuda::std::forward(u); } }; /// @brief Default inequality functor struct Inequality { /// Boolean inequality operator, returns `t != u` template __host__ __device__ __forceinline__ bool operator()(T &&t, U &&u) const { return ::cuda::std::forward(t) != ::cuda::std::forward(u); } }; /// @brief Inequality functor (wraps equality functor) template struct InequalityWrapper { /// Wrapped equality operator EqualityOp op; /// Constructor __host__ __device__ __forceinline__ InequalityWrapper(EqualityOp op) : op(op) {} /// Boolean inequality operator, returns `t != u` template __host__ __device__ __forceinline__ bool operator()(T &&t, U &&u) { return !op(std::forward(t), std::forward(u)); } }; /// @brief Default sum functor struct Sum { /// Binary sum operator, returns `t + u` template __host__ __device__ __forceinline__ auto operator()(T &&t, U &&u) const -> decltype(::cuda::std::forward(t) + ::cuda::std::forward(u)) { return ::cuda::std::forward(t) + ::cuda::std::forward(u); } }; /// @brief Default difference functor struct Difference { /// Binary difference operator, returns `t - u` template __host__ __device__ __forceinline__ auto operator()(T &&t, U &&u) const -> decltype(::cuda::std::forward(t) - ::cuda::std::forward(u)) { return ::cuda::std::forward(t) - ::cuda::std::forward(u); } }; /// @brief Default division functor struct Division { /// Binary division operator, returns `t / u` template __host__ __device__ __forceinline__ auto operator()(T &&t, U &&u) const -> decltype(::cuda::std::forward(t) / ::cuda::std::forward(u)) { return ::cuda::std::forward(t) / ::cuda::std::forward(u); } }; /// @brief Default max functor struct Max { /// Boolean max operator, returns `(t > u) ? t : u` template __host__ __device__ __forceinline__ typename ::cuda::std::common_type::type operator()(T &&t, U &&u) const { return CUB_MAX(t, u); } }; /// @brief Arg max functor (keeps the value and offset of the first occurrence /// of the larger item) struct ArgMax { /// Boolean max operator, preferring the item having the smaller offset in /// case of ties template __host__ __device__ __forceinline__ KeyValuePair operator()(const KeyValuePair &a, const KeyValuePair &b) const { // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) // return ((b.value > a.value) || // ((a.value == b.value) && (b.key < a.key))) // ? b : a; if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) { return b; } return a; } }; /// @brief Default min functor struct Min { /// Boolean min operator, returns `(t < u) ? 
t : u` template __host__ __device__ __forceinline__ typename ::cuda::std::common_type::type operator()(T &&t, U &&u) const { return CUB_MIN(t, u); } }; /// @brief Arg min functor (keeps the value and offset of the first occurrence /// of the smallest item) struct ArgMin { /// Boolean min operator, preferring the item having the smaller offset in /// case of ties template __host__ __device__ __forceinline__ KeyValuePair operator()(const KeyValuePair &a, const KeyValuePair &b) const { // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) // return ((b.value < a.value) || // ((a.value == b.value) && (b.key < a.key))) // ? b : a; if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) { return b; } return a; } }; /// @brief Default cast functor template struct CastOp { /// Cast operator, returns `(B) a` template __host__ __device__ __forceinline__ B operator()(A &&a) const { return (B)a; } }; /// @brief Binary operator wrapper for switching non-commutative scan arguments template class SwizzleScanOp { private: /// Wrapped scan operator ScanOp scan_op; public: /// Constructor __host__ __device__ __forceinline__ SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} /// Switch the scan arguments template __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) { T _a(a); T _b(b); return scan_op(_b, _a); } }; /** * @brief Reduce-by-segment functor. * * Given two cub::KeyValuePair inputs `a` and `b` and a binary associative * combining operator `f(const T &x, const T &y)`, an instance of this functor * returns a cub::KeyValuePair whose `key` field is `a.key + b.key`, and whose * `value` field is either `b.value` if `b.key` is non-zero, or * `f(a.value, b.value)` otherwise. * * ReduceBySegmentOp is an associative, non-commutative binary combining * operator for input sequences of cub::KeyValuePair pairings. Such sequences * are typically used to represent a segmented set of values to be reduced * and a corresponding set of {0,1}-valued integer "head flags" demarcating the * first value of each segment. * * @tparam ReductionOpT Binary reduction operator to apply to values */ template struct ReduceBySegmentOp { /// Wrapped reduction operator ReductionOpT op; /// Constructor __host__ __device__ __forceinline__ ReduceBySegmentOp() {} /// Constructor __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} /** * @brief Scan operator * * @tparam KeyValuePairT * KeyValuePair pairing of T (value) and OffsetT (head flag) * * @param[in] first * First partial reduction * * @param[in] second * Second partial reduction */ template __host__ __device__ __forceinline__ KeyValuePairT operator()(const KeyValuePairT &first, const KeyValuePairT &second) { KeyValuePairT retval; retval.key = first.key + second.key; #ifdef _NVHPC_CUDA // WAR bug on nvc++ if (second.key) { retval.value = second.value; } else { // If second.value isn't copied into a temporary here, nvc++ will // crash while compiling the TestScanByKeyWithLargeTypes test in // thrust/testing/scan_by_key.cu: auto v2 = second.value; retval.value = op(first.value, v2); } #else // not nvc++: // if (second.key) { // The second partial reduction spans a segment reset, so it's value // aggregate becomes the running aggregate // else { // The second partial reduction does not span a reset, so accumulate both // into the running aggregate // } retval.value = (second.key) ? 
second.value : op(first.value, second.value); #endif return retval; } }; /** * @tparam ReductionOpT Binary reduction operator to apply to values */ template struct ReduceByKeyOp { /// Wrapped reduction operator ReductionOpT op; /// Constructor __host__ __device__ __forceinline__ ReduceByKeyOp() {} /// Constructor __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} /** * @brief Scan operator * * @param[in] first First partial reduction * @param[in] second Second partial reduction */ template __host__ __device__ __forceinline__ KeyValuePairT operator()(const KeyValuePairT &first, const KeyValuePairT &second) { KeyValuePairT retval = second; if (first.key == second.key) { retval.value = op(first.value, retval.value); } return retval; } }; template struct BinaryFlip { BinaryOpT binary_op; __device__ __host__ explicit BinaryFlip(BinaryOpT binary_op) : binary_op(binary_op) {} template __device__ auto operator()(T &&t, U &&u) -> decltype(binary_op(std::forward(u), std::forward(t))) { return binary_op(std::forward(u), std::forward(t)); } }; template __device__ __host__ BinaryFlip MakeBinaryFlip(BinaryOpT binary_op) { return BinaryFlip(binary_op); } /** @} */ // end group UtilModule CUB_NAMESPACE_END cub-2.0.1/cub/thread/thread_reduce.cuh000066400000000000000000000141641434614775400176270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * Thread utilities for sequential reduction over statically-sized array types */ #pragma once #include "../thread/thread_operators.cuh" #include "../detail/type_traits.cuh" #include "../config.cuh" CUB_NAMESPACE_BEGIN /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) namespace internal { /** * Sequential reduction over statically-sized array types */ template < int LENGTH, typename T, typename ReductionOp, typename PrefixT, typename AccumT = detail::accumulator_t> __device__ __forceinline__ AccumT ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator PrefixT prefix, ///< [in] Prefix to seed reduction with Int2Type /*length*/) { AccumT retval = prefix; #pragma unroll for (int i = 0; i < LENGTH; ++i) retval = reduction_op(retval, input[i]); return retval; } /** * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. * * \tparam LENGTH LengthT of input array * \tparam T [inferred] The data type to be reduced. * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ReductionOp, typename PrefixT, typename AccumT = detail::accumulator_t> __device__ __forceinline__ AccumT ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator PrefixT prefix) ///< [in] Prefix to seed reduction with { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } /** * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. * * \tparam LENGTH LengthT of input array * \tparam T [inferred] The data type to be reduced. * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ReductionOp> __device__ __forceinline__ T ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op) ///< [in] Binary reduction operator { T prefix = input[0]; return ThreadReduce(input + 1, reduction_op, prefix); } /** * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. * * \tparam LENGTH [inferred] LengthT of \p input array * \tparam T [inferred] The data type to be reduced. * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ReductionOp, typename PrefixT, typename AccumT = detail::accumulator_t> __device__ __forceinline__ AccumT ThreadReduce( T (&input)[LENGTH], ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator PrefixT prefix) ///< [in] Prefix to seed reduction with { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } /** * \brief Serial reduction with the specified operator * * \tparam LENGTH [inferred] LengthT of \p input array * \tparam T [inferred] The data type to be reduced. 
* \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ReductionOp> __device__ __forceinline__ T ThreadReduce( T (&input)[LENGTH], ///< [in] Input array ReductionOp reduction_op) ///< [in] Binary reduction operator { return ThreadReduce((T*) input, reduction_op); } } // internal namespace CUB_NAMESPACE_END cub-2.0.1/cub/thread/thread_scan.cuh000066400000000000000000000243031434614775400173000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Thread utilities for sequential prefix scan over statically-sized array types */ #pragma once #include "../config.cuh" #include "../thread/thread_operators.cuh" CUB_NAMESPACE_BEGIN /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) namespace internal { /** * \addtogroup UtilModule * @{ */ /** * \name Sequential prefix scan over statically-sized array types * @{ */ template < int LENGTH, typename T, typename ScanOp> __device__ __forceinline__ T ThreadScanExclusive( T inclusive, T exclusive, T *input, ///< [in] Input array T *output, ///< [out] Output array (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator Int2Type /*length*/) { #pragma unroll for (int i = 0; i < LENGTH; ++i) { inclusive = scan_op(exclusive, input[i]); output[i] = exclusive; exclusive = inclusive; } return inclusive; } /** * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. * * \tparam LENGTH LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. 
* \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ScanOp> __device__ __forceinline__ T ThreadScanExclusive( T *input, ///< [in] Input array T *output, ///< [out] Output array (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator T prefix, ///< [in] Prefix to seed scan with bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) { T inclusive = input[0]; if (apply_prefix) { inclusive = scan_op(prefix, inclusive); } output[0] = prefix; T exclusive = inclusive; return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); } /** * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. * * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ScanOp> __device__ __forceinline__ T ThreadScanExclusive( T (&input)[LENGTH], ///< [in] Input array T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator T prefix, ///< [in] Prefix to seed scan with bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) { return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); } template < int LENGTH, typename T, typename ScanOp> __device__ __forceinline__ T ThreadScanInclusive( T inclusive, T *input, ///< [in] Input array T *output, ///< [out] Output array (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator Int2Type /*length*/) { #pragma unroll for (int i = 0; i < LENGTH; ++i) { inclusive = scan_op(inclusive, input[i]); output[i] = inclusive; } return inclusive; } /** * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. * * \tparam LENGTH LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ScanOp> __device__ __forceinline__ T ThreadScanInclusive( T *input, ///< [in] Input array T *output, ///< [out] Output array (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan operator { T inclusive = input[0]; output[0] = inclusive; // Continue scan return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); } /** * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. * * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. 
* \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ScanOp> __device__ __forceinline__ T ThreadScanInclusive( T (&input)[LENGTH], ///< [in] Input array T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) ScanOp scan_op) ///< [in] Binary scan operator { return ThreadScanInclusive((T*) input, (T*) output, scan_op); } /** * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. * * \tparam LENGTH LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ScanOp> __device__ __forceinline__ T ThreadScanInclusive( T *input, ///< [in] Input array T *output, ///< [out] Output array (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator T prefix, ///< [in] Prefix to seed scan with bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) { T inclusive = input[0]; if (apply_prefix) { inclusive = scan_op(prefix, inclusive); } output[0] = inclusive; // Continue scan return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); } /** * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. * * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template < int LENGTH, typename T, typename ScanOp> __device__ __forceinline__ T ThreadScanInclusive( T (&input)[LENGTH], ///< [in] Input array T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator T prefix, ///< [in] Prefix to seed scan with bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) { return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); } //@} end member group /** @} */ // end group UtilModule } // internal namespace CUB_NAMESPACE_END cub-2.0.1/cub/thread/thread_search.cuh000066400000000000000000000127761434614775400176340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Thread utilities for sequential search */ #pragma once #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * Computes the begin offsets into A and B for the specific diagonal */ template < typename AIteratorT, typename BIteratorT, typename OffsetT, typename CoordinateT> __host__ __device__ __forceinline__ void MergePathSearch( OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT& path_coordinate) { /// The value type of the input iterator using T = cub::detail::value_t; OffsetT split_min = CUB_MAX(diagonal - b_len, 0); OffsetT split_max = CUB_MIN(diagonal, a_len); while (split_min < split_max) { OffsetT split_pivot = (split_min + split_max) >> 1; if (a[split_pivot] <= b[diagonal - split_pivot - 1]) { // Move candidate split range up A, down B split_min = split_pivot + 1; } else { // Move candidate split range up B, down A split_max = split_pivot; } } path_coordinate.x = CUB_MIN(split_min, a_len); path_coordinate.y = diagonal - split_min; } /** * \brief Returns the offset of the first value within \p input which does not compare less than \p val */ template < typename InputIteratorT, typename OffsetT, typename T> __device__ __forceinline__ OffsetT LowerBound( InputIteratorT input, ///< [in] Input sequence OffsetT num_items, ///< [in] Input sequence length T val) ///< [in] Search key { OffsetT retval = 0; while (num_items > 0) { OffsetT half = num_items >> 1; if (input[retval + half] < val) { retval = retval + (half + 1); num_items = num_items - (half + 1); } else { num_items = half; } } return retval; } /** * \brief Returns the offset of the first value within \p input which compares greater than \p val */ template < typename InputIteratorT, typename OffsetT, typename T> __device__ __forceinline__ OffsetT UpperBound( InputIteratorT input, ///< [in] Input sequence OffsetT num_items, ///< [in] Input sequence length T val) ///< [in] Search key { OffsetT retval = 0; while (num_items > 0) { OffsetT half = num_items >> 1; if (val < input[retval + half]) { num_items = half; } else { retval = retval + (half + 1); num_items = num_items - (half + 1); } } return retval; } #if defined(__CUDA_FP16_TYPES_EXIST__) template < typename InputIteratorT, typename OffsetT> __device__ __forceinline__ OffsetT UpperBound( InputIteratorT input, ///< [in] Input sequence OffsetT num_items, ///< [in] Input sequence length __half val) ///< [in] Search key { OffsetT retval = 0; while (num_items > 0) { OffsetT half = num_items >> 1; bool lt; NV_IF_TARGET(NV_PROVIDES_SM_53, (lt = val < input[retval + half];), (lt = __half2float(val) < __half2float(input[retval + half]);)); if (lt) { num_items = half; } else { retval = retval + 
(half + 1); num_items = num_items - (half + 1); } } return retval; } #endif CUB_NAMESPACE_END cub-2.0.1/cub/thread/thread_sort.cuh000066400000000000000000000067271434614775400173550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN template __device__ __forceinline__ void Swap(T &lhs, T &rhs) { T temp = lhs; lhs = rhs; rhs = temp; } /** * @brief Sorts data using odd-even sort method * * The sorting method is stable. Further details can be found in: * A. Nico Habermann. Parallel neighbor sort (or the glory of the induction * principle). Technical Report AD-759 248, Carnegie Mellon University, 1972. * * @tparam KeyT * Key type * * @tparam ValueT * Value type. If `cub::NullType` is used as `ValueT`, only keys are sorted. 
* * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)` * * @tparam ITEMS_PER_THREAD * The number of items per thread * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second */ template __device__ __forceinline__ void StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op) { constexpr bool KEYS_ONLY = std::is_same::value; #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; ++i) { #pragma unroll for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) { if (compare_op(keys[j + 1], keys[j])) { Swap(keys[j], keys[j + 1]); if (!KEYS_ONLY) { Swap(items[j], items[j + 1]); } } } // inner loop } // outer loop } CUB_NAMESPACE_END cub-2.0.1/cub/thread/thread_store.cuh000066400000000000000000000422361434614775400175150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Thread utilities for writing memory using PTX cache modifiers. */ #pragma once #include #include #include CUB_NAMESPACE_BEGIN /** * \addtogroup UtilIo * @{ */ //----------------------------------------------------------------------------- // Tags and constants //----------------------------------------------------------------------------- /** * \brief Enumeration of cache modifiers for memory store operations. 
*/ enum CacheStoreModifier { STORE_DEFAULT, ///< Default (no modifier) STORE_WB, ///< Cache write-back all coherent levels STORE_CG, ///< Cache at global level STORE_CS, ///< Cache streaming (likely to be accessed once) STORE_WT, ///< Cache write-through (to system memory) STORE_VOLATILE, ///< Volatile shared (any memory space) }; /** * \name Thread I/O (cache modified) * @{ */ /** * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. * * \par Example * \code * #include // or equivalently * * // 32-bit store using cache-global modifier: * int *d_out; * int val; * cub::ThreadStore(d_out + threadIdx.x, val); * * // 16-bit store using default modifier * short *d_out; * short val; * cub::ThreadStore(d_out + threadIdx.x, val); * * // 256-bit store using write-through modifier * double4 *d_out; * double4 val; * cub::ThreadStore(d_out + threadIdx.x, val); * * // 96-bit store using cache-streaming cache modifier * struct TestFoo { bool a; short b; }; * TestFoo *d_struct; * TestFoo val; * cub::ThreadStore(d_out + threadIdx.x, val); * \endcode * * \tparam MODIFIER [inferred] CacheStoreModifier enumeration * \tparam InputIteratorT [inferred] Output iterator type \iterator * \tparam T [inferred] Data type of output value */ template < CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T> __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val); //@} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Helper structure for templated store iteration (inductive case) template struct IterateThreadStore { template static __device__ __forceinline__ void Store(T *ptr, T *vals) { ThreadStore(ptr + COUNT, vals[COUNT]); IterateThreadStore::template Store(ptr, vals); } template static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) { ptr[COUNT] = vals[COUNT]; IterateThreadStore::Dereference(ptr, vals); } }; /// Helper structure for templated store iteration (termination case) template struct IterateThreadStore { template static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {} template static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {} }; /** * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier */ #define _CUB_STORE_16(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ { \ asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ _CUB_ASM_PTR_(ptr), \ "r"(val.x), \ "r"(val.y), \ "r"(val.z), \ "r"(val.w)); \ } \ template<> \ __device__ __forceinline__ void ThreadStore(ulonglong2* ptr, ulonglong2 val) \ { \ asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ _CUB_ASM_PTR_(ptr), \ "l"(val.x), \ "l"(val.y)); \ } /** * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier */ #define _CUB_STORE_8(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ { \ asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ _CUB_ASM_PTR_(ptr), \ "h"(val.x), \ "h"(val.y), \ "h"(val.z), \ "h"(val.w)); \ } \ template<> \ __device__ __forceinline__ void ThreadStore(uint2* ptr, uint2 val) \ { \ asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ _CUB_ASM_PTR_(ptr), \ "r"(val.x), \ "r"(val.y)); \ } \ template<> \ __device__ __forceinline__ void ThreadStore(unsigned long long* ptr, unsigned long long val) \ 
{ \ asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ _CUB_ASM_PTR_(ptr), \ "l"(val)); \ } /** * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier */ #define _CUB_STORE_4(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(unsigned int* ptr, unsigned int val) \ { \ asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ _CUB_ASM_PTR_(ptr), \ "r"(val)); \ } /** * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier */ #define _CUB_STORE_2(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(unsigned short* ptr, unsigned short val) \ { \ asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ _CUB_ASM_PTR_(ptr), \ "h"(val)); \ } /** * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier */ #define _CUB_STORE_1(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ { \ asm volatile ( \ "{" \ " .reg .u8 datum;" \ " cvt.u8.u16 datum, %1;" \ " st."#ptx_modifier".u8 [%0], datum;" \ "}" : : \ _CUB_ASM_PTR_(ptr), \ "h"((unsigned short) val)); \ } /** * Define powers-of-two ThreadStore specializations for the given Cache load modifier */ #define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ _CUB_STORE_16(cub_modifier, ptx_modifier) \ _CUB_STORE_8(cub_modifier, ptx_modifier) \ _CUB_STORE_4(cub_modifier, ptx_modifier) \ _CUB_STORE_2(cub_modifier, ptx_modifier) \ _CUB_STORE_1(cub_modifier, ptx_modifier) \ /** * Define ThreadStore specializations for the various Cache load modifiers */ _CUB_STORE_ALL(STORE_WB, wb) _CUB_STORE_ALL(STORE_CG, cg) _CUB_STORE_ALL(STORE_CS, cs) _CUB_STORE_ALL(STORE_WT, wt) // Macro cleanup #undef _CUB_STORE_ALL #undef _CUB_STORE_1 #undef _CUB_STORE_2 #undef _CUB_STORE_4 #undef _CUB_STORE_8 #undef _CUB_STORE_16 /** * ThreadStore definition for STORE_DEFAULT modifier on iterator types */ template __device__ __forceinline__ void ThreadStore( OutputIteratorT itr, T val, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { *itr = val; } /** * ThreadStore definition for STORE_DEFAULT modifier on pointer types */ template __device__ __forceinline__ void ThreadStore( T *ptr, T val, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { *ptr = val; } /** * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types */ template __device__ __forceinline__ void ThreadStoreVolatilePtr( T *ptr, T val, Int2Type /*is_primitive*/) { *reinterpret_cast(ptr) = val; } /** * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types */ template __device__ __forceinline__ void ThreadStoreVolatilePtr( T *ptr, T val, Int2Type /*is_primitive*/) { // Create a temporary using shuffle-words, then store using volatile-words typedef typename UnitWord::VolatileWord VolatileWord; typedef typename UnitWord::ShuffleWord ShuffleWord; const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); VolatileWord words[VOLATILE_MULTIPLE]; #pragma unroll for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( reinterpret_cast(ptr), words); } /** * ThreadStore definition for STORE_VOLATILE modifier on pointer types */ template __device__ __forceinline__ void ThreadStore( T *ptr, T val, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { ThreadStoreVolatilePtr(ptr, val, 
Int2Type::PRIMITIVE>()); } /** * ThreadStore definition for generic modifiers on pointer types */ template __device__ __forceinline__ void ThreadStore( T *ptr, T val, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { // Create a temporary using shuffle-words, then store using device-words typedef typename UnitWord::DeviceWord DeviceWord; typedef typename UnitWord::ShuffleWord ShuffleWord; const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); DeviceWord words[DEVICE_MULTIPLE]; #pragma unroll for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( reinterpret_cast(ptr), words); } /** * ThreadStore definition for generic modifiers */ template __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) { ThreadStore( itr, val, Int2Type(), Int2Type::value>()); } #endif // DOXYGEN_SHOULD_SKIP_THIS /** @} */ // end group UtilIo CUB_NAMESPACE_END cub-2.0.1/cub/util_allocator.cuh000066400000000000000000000706001434614775400165740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple caching allocator for device memory allocations. The allocator is * thread-safe and capable of managing device allocations on multiple devices. 
******************************************************************************/ #pragma once #include "util_namespace.cuh" #include "util_debug.cuh" #include #include #include "host/mutex.cuh" #include CUB_NAMESPACE_BEGIN /** * \addtogroup UtilMgmt * @{ */ /****************************************************************************** * CachingDeviceAllocator (host use) ******************************************************************************/ /** * \brief A simple caching allocator for device memory allocations. * * \par Overview * The allocator is thread-safe and stream-safe and is capable of managing cached * device allocations on multiple devices. It behaves as follows: * * \par * - Allocations from the allocator are associated with an \p active_stream. Once freed, * the allocation becomes available immediately for reuse within the \p active_stream * with which it was associated with during allocation, and it becomes available for * reuse within other streams when all prior work submitted to \p active_stream has completed. * - Allocations are categorized and cached by bin size. A new allocation request of * a given size will only consider cached allocations within the corresponding bin. * - Bin limits progress geometrically in accordance with the growth factor * \p bin_growth provided during construction. Unused device allocations within * a larger bin cache are not reused for allocation requests that categorize to * smaller bin sizes. * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to * (\p bin_growth ^ \p min_bin). * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest * bin and are simply freed when they are deallocated instead of being returned * to a bin-cache. * - If the total storage of cached allocations on a given device will exceed * \p max_cached_bytes, allocations for that device are simply freed when they are * deallocated instead of being returned to their bin-cache. 
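 *
 * \par
 * A minimal usage sketch (illustrative only; the 1MB request size, the default
 * stream, and the variable names below are arbitrary placeholders rather than
 * recommendations):
 * \code
 * #include <cub/util_allocator.cuh>
 *
 * cub::CachingDeviceAllocator allocator;   // default-constructed configuration
 *
 * void        *d_scratch = NULL;
 * cudaStream_t stream    = 0;
 *
 * // Request 1MB associated with the stream (rounded up to a bin size)
 * cudaError_t error = allocator.DeviceAllocate(&d_scratch, 1024 * 1024, stream);
 *
 * // ... enqueue kernels that use d_scratch on the stream ...
 *
 * // Return the block to the allocator's cache for reuse (instead of cudaFree)
 * error = allocator.DeviceFree(d_scratch);
 *
 * // Optionally release all cached blocks back to the CUDA runtime
 * error = allocator.FreeAllCached();
 * \endcode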
* * \par * For example, the default-constructed CachingDeviceAllocator is configured with: * - \p bin_growth = 8 * - \p min_bin = 3 * - \p max_bin = 7 * - \p max_cached_bytes = 6MB - 1B * * \par * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB * and sets a maximum of 6,291,455 cached bytes per device * */ struct CachingDeviceAllocator { //--------------------------------------------------------------------- // Constants //--------------------------------------------------------------------- /// Out-of-bounds bin static const unsigned int INVALID_BIN = (unsigned int) -1; /// Invalid size static const size_t INVALID_SIZE = (size_t) -1; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Invalid device ordinal static const int INVALID_DEVICE_ORDINAL = -1; //--------------------------------------------------------------------- // Type definitions and helper types //--------------------------------------------------------------------- /** * Descriptor for device memory allocations */ struct BlockDescriptor { void* d_ptr; // Device pointer size_t bytes; // Size of allocation in bytes unsigned int bin; // Bin enumeration int device; // device ordinal cudaStream_t associated_stream; // Associated associated_stream cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed // Constructor (suitable for searching maps for a specific block, given its pointer and device) BlockDescriptor(void *d_ptr, int device) : d_ptr(d_ptr), bytes(0), bin(INVALID_BIN), device(device), associated_stream(0), ready_event(0) {} // Constructor (suitable for searching maps for a range of suitable blocks, given a device) BlockDescriptor(int device) : d_ptr(NULL), bytes(0), bin(INVALID_BIN), device(device), associated_stream(0), ready_event(0) {} // Comparison functor for comparing device pointers static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { if (a.device == b.device) return (a.d_ptr < b.d_ptr); else return (a.device < b.device); } // Comparison functor for comparing allocation sizes static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { if (a.device == b.device) return (a.bytes < b.bytes); else return (a.device < b.device); } }; /// BlockDescriptor comparator function interface typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); class TotalBytes { public: size_t free; size_t live; TotalBytes() { free = live = 0; } }; /// Set type for cached blocks (ordered by size) typedef std::multiset CachedBlocks; /// Set type for live blocks (ordered by ptr) typedef std::multiset BusyBlocks; /// Map type of device ordinals to the number of cached bytes cached by each device typedef std::map GpuCachedBytes; //--------------------------------------------------------------------- // Utility functions //--------------------------------------------------------------------- /** * Integer pow function for unsigned base and exponent */ static unsigned int IntPow( unsigned int base, unsigned int exp) { unsigned int retval = 1; while (exp > 0) { if (exp & 1) { retval = retval * base; // multiply the result by the current base } base = base * base; // square the base exp = exp >> 1; // divide the exponent in half } return retval; } /** * Round up to the nearest power-of */ void NearestPowerOf( unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) { power = 0; rounded_bytes = 1; if (value * base < value) { // Overflow power = sizeof(size_t) * 8; rounded_bytes = size_t(0) - 1; 
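            // Note: size_t(0) - 1 wraps around to the maximum value representable
            // by size_t, so the request is saturated rather than silently truncated.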
return; } while (rounded_bytes < value) { rounded_bytes *= base; power++; } } //--------------------------------------------------------------------- // Fields //--------------------------------------------------------------------- cub::Mutex mutex; /// Mutex for thread-safety unsigned int bin_growth; /// Geometric growth factor for bin-sizes unsigned int min_bin; /// Minimum bin enumeration unsigned int max_bin; /// Maximum bin enumeration size_t min_bin_bytes; /// Minimum bin size size_t max_bin_bytes; /// Maximum bin size size_t max_cached_bytes; /// Maximum aggregate cached bytes per device const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) bool debug; /// Whether or not to print (de)allocation events to stdout GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse BusyBlocks live_blocks; /// Set of live device allocations currently in use #endif // DOXYGEN_SHOULD_SKIP_THIS //--------------------------------------------------------------------- // Methods //--------------------------------------------------------------------- /** * \brief Constructor. */ CachingDeviceAllocator( unsigned int bin_growth, ///< Geometric growth factor for bin-sizes unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) : bin_growth(bin_growth), min_bin(min_bin), max_bin(max_bin), min_bin_bytes(IntPow(bin_growth, min_bin)), max_bin_bytes(IntPow(bin_growth, max_bin)), max_cached_bytes(max_cached_bytes), skip_cleanup(skip_cleanup), debug(debug), cached_blocks(BlockDescriptor::SizeCompare), live_blocks(BlockDescriptor::PtrCompare) {} /** * \brief Default constructor. * * Configured with: * \par * - \p bin_growth = 8 * - \p min_bin = 3 * - \p max_bin = 7 * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes * * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and * sets a maximum of 6,291,455 cached bytes per device */ CachingDeviceAllocator( bool skip_cleanup = false, bool debug = false) : bin_growth(8), min_bin(3), max_bin(7), min_bin_bytes(IntPow(bin_growth, min_bin)), max_bin_bytes(IntPow(bin_growth, max_bin)), max_cached_bytes((max_bin_bytes * 3) - 1), skip_cleanup(skip_cleanup), debug(debug), cached_blocks(BlockDescriptor::SizeCompare), live_blocks(BlockDescriptor::PtrCompare) {} /** * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. * * Changing the ceiling of cached bytes does not cause any allocations (in-use or * cached-in-reserve) to be freed. See \p FreeAllCached(). 
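     *
     * \par
     * For instance, assuming an existing allocator instance named \p allocator
     * (the 32MB figure is arbitrary):
     * \code
     * allocator.SetMaxCachedBytes(32 * 1024 * 1024);
     * \endcode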
*/ cudaError_t SetMaxCachedBytes(size_t max_cached_bytes_) { // Lock mutex.Lock(); if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes_); this->max_cached_bytes = max_cached_bytes_; // Unlock mutex.Unlock(); return cudaSuccess; } /** * \brief Provides a suitable allocation of device memory for the given size on the specified device. * * Once freed, the allocation becomes available immediately for reuse within the \p active_stream * with which it was associated with during allocation, and it becomes available for reuse within other * streams when all prior work submitted to \p active_stream has completed. */ cudaError_t DeviceAllocate( int device, ///< [in] Device on which to place the allocation void **d_ptr, ///< [out] Reference to pointer to the allocation size_t bytes, ///< [in] Minimum number of bytes for the allocation cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation { *d_ptr = NULL; int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; if (device == INVALID_DEVICE_ORDINAL) { if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; device = entrypoint_device; } // Create a block descriptor for the requested allocation bool found = false; BlockDescriptor search_key(device); search_key.associated_stream = active_stream; NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); if (search_key.bin > max_bin) { // Bin is greater than our maximum bin: allocate the request // exactly and give out-of-bounds bin. It will not be cached // for reuse when returned. search_key.bin = INVALID_BIN; search_key.bytes = bytes; } else { // Search for a suitable cached allocation: lock mutex.Lock(); if (search_key.bin < min_bin) { // Bin is less than minimum bin: round up search_key.bin = min_bin; search_key.bytes = min_bin_bytes; } // Iterate through the range of cached blocks on the same device in the same bin CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); while ((block_itr != cached_blocks.end()) && (block_itr->device == device) && (block_itr->bin == search_key.bin)) { // To prevent races with reusing blocks returned by the host but still // in use by the device, only consider cached blocks that are // either (from the active stream) or (from an idle stream) bool is_reusable = false; if (active_stream == block_itr->associated_stream) { is_reusable = true; } else { const cudaError_t event_status = cudaEventQuery(block_itr->ready_event); if(event_status != cudaErrorNotReady) { CubDebug(event_status); is_reusable = true; } } if(is_reusable) { // Reuse existing cache block. Insert into live blocks. 
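                    // (the cached block's ready_event is retained; only its stream
                    // association is updated to the requesting stream)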
found = true; search_key = *block_itr; search_key.associated_stream = active_stream; live_blocks.insert(search_key); // Remove from free blocks cached_bytes[device].free -= search_key.bytes; cached_bytes[device].live += search_key.bytes; if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); cached_blocks.erase(block_itr); break; } block_itr++; } // Done searching: unlock mutex.Unlock(); } // Allocate the block if necessary if (!found) { // Set runtime's current device to specified device (entrypoint may not be set) if (device != entrypoint_device) { if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; if (CubDebug(error = cudaSetDevice(device))) return error; } // Attempt to allocate if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", device, (long long) search_key.bytes, (long long) search_key.associated_stream); error = cudaSuccess; // Reset the error we will return cudaGetLastError(); // Reset CUDART's error // Lock mutex.Lock(); // Iterate the range of free blocks on the same device BlockDescriptor free_key(device); CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) { // No need to worry about synchronization with the device: cudaFree is // blocking and will synchronize across all kernels executing // on the current device // Free device memory and destroy stream event. 
if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); cached_blocks.erase(block_itr); block_itr++; } // Unlock mutex.Unlock(); // Return under error if (error) return error; // Try to allocate again if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; } // Create ready event if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) return error; // Insert into live blocks mutex.Lock(); live_blocks.insert(search_key); cached_bytes[device].live += search_key.bytes; mutex.Unlock(); if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); // Attempt to revert back to previous device if necessary if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; } } // Copy device pointer to output parameter *d_ptr = search_key.d_ptr; if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); return error; } /** * \brief Provides a suitable allocation of device memory for the given size on the current device. * * Once freed, the allocation becomes available immediately for reuse within the \p active_stream * with which it was associated with during allocation, and it becomes available for reuse within other * streams when all prior work submitted to \p active_stream has completed. */ cudaError_t DeviceAllocate( void **d_ptr, ///< [out] Reference to pointer to the allocation size_t bytes, ///< [in] Minimum number of bytes for the allocation cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation { return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); } /** * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. * * Once freed, the allocation becomes available immediately for reuse within the \p active_stream * with which it was associated with during allocation, and it becomes available for reuse within other * streams when all prior work submitted to \p active_stream has completed. 
*/ cudaError_t DeviceFree( int device, void* d_ptr) { int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; if (device == INVALID_DEVICE_ORDINAL) { if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; device = entrypoint_device; } // Lock mutex.Lock(); // Find corresponding block descriptor bool recached = false; BlockDescriptor search_key(d_ptr, device); BusyBlocks::iterator block_itr = live_blocks.find(search_key); if (block_itr != live_blocks.end()) { // Remove from live blocks search_key = *block_itr; live_blocks.erase(block_itr); cached_bytes[device].live -= search_key.bytes; // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { // Insert returned allocation into free blocks recached = true; cached_blocks.insert(search_key); cached_bytes[device].free += search_key.bytes; if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); } } // Unlock mutex.Unlock(); // First set to specified device (entrypoint may not be set) if (device != entrypoint_device) { if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; if (CubDebug(error = cudaSetDevice(device))) return error; } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; } if (!recached) { // Free the allocation from the runtime and cleanup the event. if (CubDebug(error = cudaFree(d_ptr))) return error; if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); } // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; } return error; } /** * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. * * Once freed, the allocation becomes available immediately for reuse within the \p active_stream * with which it was associated with during allocation, and it becomes available for reuse within other * streams when all prior work submitted to \p active_stream has completed. 
*/ cudaError_t DeviceFree( void* d_ptr) { return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); } /** * \brief Frees all cached device allocations on all devices */ cudaError_t FreeAllCached() { cudaError_t error = cudaSuccess; int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; mutex.Lock(); while (!cached_blocks.empty()) { // Get first block CachedBlocks::iterator begin = cached_blocks.begin(); // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; } // Set current device ordinal if necessary if (begin->device != current_device) { if (CubDebug(error = cudaSetDevice(begin->device))) break; current_device = begin->device; } // Free device memory if (CubDebug(error = cudaFree(begin->d_ptr))) break; if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; // Reduce balance and erase entry const size_t block_bytes = begin->bytes; cached_bytes[current_device].free -= block_bytes; cached_blocks.erase(begin); if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); } mutex.Unlock(); // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; } return error; } /** * \brief Destructor */ virtual ~CachingDeviceAllocator() { if (!skip_cleanup) FreeAllCached(); } }; /** @} */ // end group UtilMgmt CUB_NAMESPACE_END cub-2.0.1/cub/util_arch.cuh000066400000000000000000000141251434614775400155310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * Static architectural properties by SM version. */ #pragma once #include #include #include // Legacy include; this functionality used to be defined in here. #include CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document #if ((__CUDACC_VER_MAJOR__ >= 9) || defined(_NVHPC_CUDA) || \ CUDA_VERSION >= 9000) && \ !defined(CUB_USE_COOPERATIVE_GROUPS) #define CUB_USE_COOPERATIVE_GROUPS #endif /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined. #ifndef CUB_PTX_ARCH #if defined(_NVHPC_CUDA) // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined // when compiling both host code and device code. Currently, only one // PTX version can be targeted. #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__ #elif !defined(__CUDA_ARCH__) #define CUB_PTX_ARCH 0 #else #define CUB_PTX_ARCH __CUDA_ARCH__ #endif #endif // These definitions were intended for internal use only and are now obsolete. // If you relied on them, consider porting your code to use the functionality // in libcu++'s header. // For a temporary workaround, define CUB_PROVIDE_LEGACY_ARCH_MACROS to make // them available again. These should be considered deprecated and will be // fully removed in a future version. #ifdef CUB_PROVIDE_LEGACY_ARCH_MACROS #ifndef CUB_IS_DEVICE_CODE #if defined(_NVHPC_CUDA) #define CUB_IS_DEVICE_CODE __builtin_is_device_code() #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) #define CUB_INCLUDE_DEVICE_CODE 1 #define CUB_INCLUDE_HOST_CODE 1 #elif CUB_PTX_ARCH > 0 #define CUB_IS_DEVICE_CODE 1 #define CUB_IS_HOST_CODE 0 #define CUB_INCLUDE_DEVICE_CODE 1 #define CUB_INCLUDE_HOST_CODE 0 #else #define CUB_IS_DEVICE_CODE 0 #define CUB_IS_HOST_CODE 1 #define CUB_INCLUDE_DEVICE_CODE 0 #define CUB_INCLUDE_HOST_CODE 1 #endif #endif #endif // CUB_PROVIDE_LEGACY_ARCH_MACROS /// Maximum number of devices supported. 
#ifndef CUB_MAX_DEVICES #define CUB_MAX_DEVICES (128) #endif static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); /// Number of threads per warp #ifndef CUB_LOG_WARP_THREADS #define CUB_LOG_WARP_THREADS(unused) (5) #define CUB_WARP_THREADS(unused) (1 << CUB_LOG_WARP_THREADS(0)) #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(0) #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(0) #endif /// Number of smem banks #ifndef CUB_LOG_SMEM_BANKS #define CUB_LOG_SMEM_BANKS(unused) (5) #define CUB_SMEM_BANKS(unused) (1 << CUB_LOG_SMEM_BANKS(0)) #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(0) #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS #endif /// Oversubscription factor #ifndef CUB_SUBSCRIPTION_FACTOR #define CUB_SUBSCRIPTION_FACTOR(unused) (5) #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(0) #endif /// Prefer padding overhead vs X-way conflicts greater than this threshold #ifndef CUB_PREFER_CONFLICT_OVER_PADDING #define CUB_PREFER_CONFLICT_OVER_PADDING(unused) (1) #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(0) #endif template < int NOMINAL_4B_BLOCK_THREADS, int NOMINAL_4B_ITEMS_PER_THREAD, typename T> struct RegBoundScaling { enum { ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), }; }; template < int NOMINAL_4B_BLOCK_THREADS, int NOMINAL_4B_ITEMS_PER_THREAD, typename T> struct MemBoundScaling { enum { ITEMS_PER_THREAD = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), }; }; #endif // Do not document CUB_NAMESPACE_END cub-2.0.1/cub/util_compiler.cuh000066400000000000000000000070741434614775400164330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * Detect compiler information. */ #pragma once // enumerate host compilers we know about #define CUB_HOST_COMPILER_UNKNOWN 0 #define CUB_HOST_COMPILER_MSVC 1 #define CUB_HOST_COMPILER_GCC 2 #define CUB_HOST_COMPILER_CLANG 3 // enumerate device compilers we know about #define CUB_DEVICE_COMPILER_UNKNOWN 0 #define CUB_DEVICE_COMPILER_MSVC 1 #define CUB_DEVICE_COMPILER_GCC 2 #define CUB_DEVICE_COMPILER_NVCC 3 #define CUB_DEVICE_COMPILER_CLANG 4 // figure out which host compiler we're using #if defined(_MSC_VER) # define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC # define CUB_MSVC_VERSION _MSC_VER # define CUB_MSVC_VERSION_FULL _MSC_FULL_VER #elif defined(__clang__) # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG # define CUB_CLANG_VERSION \ (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) #elif defined(__GNUC__) # define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC # define CUB_GCC_VERSION \ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #else # define CUB_HOST_COMPILER CUB_HOST_COMPILER_UNKNOWN #endif // CUB_HOST_COMPILER // figure out which device compiler we're using #if defined(__CUDACC__) || defined(_NVHPC_CUDA) # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG // CUDA-capable clang should behave similar to NVCC. # if defined(__CUDA__) # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC # else # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG # endif #else # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN #endif cub-2.0.1/cub/util_cpp_dialect.cuh000066400000000000000000000142751434614775400170710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /*! \file * \brief Detect the version of the C++ standard used by the compiler. */ #pragma once #include "util_compiler.cuh" // Deprecation warnings may be silenced by defining the following macros. These // may be combined. // - CUB_IGNORE_DEPRECATED_CPP_DIALECT: // Ignore all deprecated C++ dialects and outdated compilers. // - CUB_IGNORE_DEPRECATED_CPP_11: // Ignore deprecation warnings when compiling with C++11. C++03 and outdated // compilers will still issue warnings. // - CUB_IGNORE_DEPRECATED_COMPILER // Ignore deprecation warnings when using deprecated compilers. Compiling // with C++03 and C++11 will still issue warnings. // Check for the thrust opt-outs as well: #if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && \ defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT) # define CUB_IGNORE_DEPRECATED_CPP_DIALECT #endif #if !defined(CUB_IGNORE_DEPRECATED_CPP_11) && \ defined(THRUST_IGNORE_DEPRECATED_CPP_11) # define CUB_IGNORE_DEPRECATED_CPP_11 #endif #if !defined(CUB_IGNORE_DEPRECATED_COMPILER) && \ defined(THRUST_IGNORE_DEPRECATED_COMPILER) # define CUB_IGNORE_DEPRECATED_COMPILER #endif #ifdef CUB_IGNORE_DEPRECATED_CPP_DIALECT # define CUB_IGNORE_DEPRECATED_CPP_11 # define CUB_IGNORE_DEPRECATED_COMPILER #endif // Define this to override the built-in detection. #ifndef CUB_CPP_DIALECT // MSVC does not define __cplusplus correctly. _MSVC_LANG is used instead. // This macro is only defined in MSVC 2015U3+. # ifdef _MSVC_LANG // Do not replace with CUB_HOST_COMPILER test (see above) // MSVC2015 reports C++14 but lacks extended constexpr support. Treat as C++11. # if CUB_MSVC_VERSION < 1910 && _MSVC_LANG > 201103L /* MSVC < 2017 && CPP > 2011 */ # define CUB_CPLUSPLUS 201103L /* Fix to 2011 */ # else # define CUB_CPLUSPLUS _MSVC_LANG /* We'll trust this for now. */ # endif // MSVC 2015 C++14 fix # else # define CUB_CPLUSPLUS __cplusplus # endif // Detect current dialect: # if CUB_CPLUSPLUS < 201103L # define CUB_CPP_DIALECT 2003 # elif CUB_CPLUSPLUS < 201402L # define CUB_CPP_DIALECT 2011 # elif CUB_CPLUSPLUS < 201703L # define CUB_CPP_DIALECT 2014 # elif CUB_CPLUSPLUS == 201703L # define CUB_CPP_DIALECT 2017 # elif CUB_CPLUSPLUS > 201703L // unknown, but is higher than 2017. # define CUB_CPP_DIALECT 2020 # endif # undef CUB_CPLUSPLUS // cleanup #endif // !CUB_CPP_DIALECT // Define CUB_COMPILER_DEPRECATION macro: #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC # define CUB_COMP_DEPR_IMPL(msg) \ __pragma(message(__FILE__ ":" CUB_COMP_DEPR_IMPL0(__LINE__) ": warning: " #msg)) # define CUB_COMP_DEPR_IMPL0(x) CUB_COMP_DEPR_IMPL1(x) # define CUB_COMP_DEPR_IMPL1(x) #x #else // clang / gcc: # define CUB_COMP_DEPR_IMPL(msg) CUB_COMP_DEPR_IMPL0(GCC warning #msg) # define CUB_COMP_DEPR_IMPL0(expr) _Pragma(#expr) # define CUB_COMP_DEPR_IMPL1 /* intentionally blank */ #endif #define CUB_COMPILER_DEPRECATION(REQ) \ CUB_COMP_DEPR_IMPL(CUB requires at least REQ. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) #define CUB_COMPILER_DEPRECATION_SOFT(REQ, CUR) \ CUB_COMP_DEPR_IMPL(CUB requires at least REQ. CUR is deprecated but still supported. CUR support will be removed in a future release. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) 
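// Example (an illustrative sketch, not exhaustive): a project that must still
// build as C++11 can silence only the dialect warning by defining the opt-out
// macro before any CUB header is included:
//
//   #define CUB_IGNORE_DEPRECATED_CPP_11   // or pass -DCUB_IGNORE_DEPRECATED_CPP_11
//   #include <cub/cub.cuh>
//
// The THRUST_IGNORE_DEPRECATED_* spellings checked above are honored as well.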
#ifndef CUB_IGNORE_DEPRECATED_COMPILER // Compiler checks: # if CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC && CUB_GCC_VERSION < 50000 CUB_COMPILER_DEPRECATION(GCC 5.0); # elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG && CUB_CLANG_VERSION < 70000 CUB_COMPILER_DEPRECATION(Clang 7.0); # elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1910 // <2017. Hard upgrade message: CUB_COMPILER_DEPRECATION(MSVC 2019 (19.20/16.0/14.20)); # elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1920 // >=2017, <2019. Soft deprecation message: CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019 (19.20/16.0/14.20), MSVC 2017); # endif #endif // CUB_IGNORE_DEPRECATED_COMPILER #ifndef CUB_IGNORE_DEPRECATED_DIALECT // Dialect checks: # if CUB_CPP_DIALECT < 2011 // #include #include #include CUB_NAMESPACE_BEGIN #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: /** * @def CUB_DEBUG_LOG * * Causes kernel launch configurations to be printed to the console */ #define CUB_DEBUG_LOG /** * @def CUB_DEBUG_SYNC * * Causes synchronization of the stream after every kernel launch to check * for errors. Also causes kernel launch configurations to be printed to the * console. */ #define CUB_DEBUG_SYNC /** * @def CUB_DEBUG_HOST_ASSERTIONS * * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition * assertions. */ #define CUB_DEBUG_HOST_ASSERTIONS /** * @def CUB_DEBUG_DEVICE_ASSERTIONS * * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side * precondition assertions. */ #define CUB_DEBUG_DEVICE_ASSERTIONS /** * @def CUB_DEBUG_ALL * * Causes host and device-side precondition assertions to be checked. Apart * from that, causes synchronization of the stream after every kernel launch to * check for errors. Also causes kernel launch configurations to be printed to * the console. 
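 *
 * \par
 * A minimal usage sketch (illustrative): since the macro is consumed while the
 * headers are preprocessed, define it before including any CUB header, e.g.
 * \code
 * #define CUB_DEBUG_ALL              // or pass -DCUB_DEBUG_ALL to the compiler
 * #include <cub/cub.cuh>
 * \endcode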
*/ #define CUB_DEBUG_ALL #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \addtogroup UtilMgmt * @{ */ // `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only: #define CUB_DETAIL_DEBUG_LEVEL_NONE 0 #define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY 1 #define CUB_DETAIL_DEBUG_LEVEL_LOG 2 #define CUB_DETAIL_DEBUG_LEVEL_SYNC 3 #define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS 4 #define CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS 5 #define CUB_DETAIL_DEBUG_LEVEL_ALL 1000 // `CUB_DEBUG_*`: User interfaces: // Extra logging, no syncs #ifdef CUB_DEBUG_LOG #define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_LOG #endif // Logging + syncs #ifdef CUB_DEBUG_SYNC #define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_SYNC #endif // Logging + syncs + host assertions #ifdef CUB_DEBUG_HOST_ASSERTIONS #define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS #endif // Logging + syncs + host assertions + device assertions #ifdef CUB_DEBUG_DEVICE_ASSERTIONS #define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS #endif // All #ifdef CUB_DEBUG_ALL #define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL #endif // Default case, no extra debugging: #ifndef CUB_DETAIL_DEBUG_LEVEL #ifdef NDEBUG #define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_NONE #else #define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY #endif #endif /* * `CUB_DETAIL_DEBUG_ENABLE_*`: * Internal implementation details, used for testing enabled debug features: */ #if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_LOG #define CUB_DETAIL_DEBUG_ENABLE_LOG #endif #if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_SYNC #define CUB_DETAIL_DEBUG_ENABLE_SYNC #endif #if (CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS) || \ (CUB_DETAIL_DEBUG_LEVEL == CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY) #define CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS #endif #if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS #define CUB_DETAIL_DEBUG_ENABLE_DEVICE_ASSERTIONS #endif /// CUB error reporting macro (prints error messages to stderr) #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) #define CUB_STDERR #endif /** * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the * corresponding error message is printed to \p stderr (or \p stdout in device * code) along with the supplied source context. * * \return The CUDA error. */ __host__ __device__ __forceinline__ cudaError_t Debug(cudaError_t error, const char *filename, int line) { // Clear the global CUDA error state which may have been set by the last // call. Otherwise, errors may "leak" to unrelated kernel launches. 
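  // The call below runs unconditionally on the host; in device code it is only
  // compiled in when CUB_RDC_ENABLED is defined, since calling the CUDA runtime
  // from device code requires relocatable device code.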
// clang-format off #ifndef CUB_RDC_ENABLED #define CUB_TEMP_DEVICE_CODE #else #define CUB_TEMP_DEVICE_CODE cudaGetLastError() #endif NV_IF_TARGET( NV_IS_HOST, (cudaGetLastError();), (CUB_TEMP_DEVICE_CODE;) ); #undef CUB_TEMP_DEVICE_CODE // clang-format on #ifdef CUB_STDERR if (error) { NV_IF_TARGET( NV_IS_HOST, ( fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); fflush(stderr); ), ( printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); ) ); } #else (void)filename; (void)line; #endif return error; } /** * \brief Debug macro */ #ifndef CubDebug #define CubDebug(e) CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__) #endif /** * \brief Debug macro with exit */ #ifndef CubDebugExit #define CubDebugExit(e) if (CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } #endif /** * \brief Log macro for printf statements. */ #if !defined(_CubLog) #if defined(_NVHPC_CUDA) || !(defined(__clang__) && defined(__CUDA__)) // NVCC / NVC++ #define _CubLog(format, ...) \ do \ { \ NV_IF_TARGET(NV_IS_HOST, \ (printf(format, __VA_ARGS__);), \ (printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ blockIdx.z, \ blockIdx.y, \ blockIdx.x, \ threadIdx.z, \ threadIdx.y, \ threadIdx.x, \ __VA_ARGS__);)); \ } while (false) #else // Clang: // XXX shameless hack for clang around variadic printf... // Compilies w/o supplying -std=c++11 but shows warning, // so we silence them :) #pragma clang diagnostic ignored "-Wc++11-extensions" #pragma clang diagnostic ignored "-Wunnamed-type-template-args" template inline __host__ __device__ void va_printf(char const *format, Args const &...args) { #ifdef __CUDA_ARCH__ printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); #else printf(format, args...); #endif } #ifndef __CUDA_ARCH__ #define _CubLog(format, ...) CUB_NS_QUALIFIER::va_printf(format, __VA_ARGS__); #else #define _CubLog(format, ...) \ CUB_NS_QUALIFIER::va_printf("[block (%d,%d,%d), thread " \ "(%d,%d,%d)]: " format, \ __VA_ARGS__); #endif #endif #endif /** @} */ // end group UtilMgmt CUB_NAMESPACE_END cub-2.0.1/cub/util_deprecated.cuh000066400000000000000000000072251434614775400167170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Define CUB_DEPRECATED macro. */ #pragma once #include #include #include #include #if defined(THRUST_IGNORE_DEPRECATED_API) && !defined(CUB_IGNORE_DEPRECATED_API) # define CUB_IGNORE_DEPRECATED_API #endif #ifdef CUB_IGNORE_DEPRECATED_API # define CUB_DEPRECATED # define CUB_DEPRECATED_BECAUSE(MSG) #elif CUB_CPP_DIALECT >= 2014 # define CUB_DEPRECATED [[deprecated]] # define CUB_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]] #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC # define CUB_DEPRECATED __declspec(deprecated) # define CUB_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG)) #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG # define CUB_DEPRECATED __attribute__((deprecated)) # define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG))) #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC # define CUB_DEPRECATED __attribute__((deprecated)) # define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG))) #else # define CUB_DEPRECATED # define CUB_DEPRECATED_BECAUSE(MSG) #endif #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED \ CUB_DEPRECATED_BECAUSE( \ "CUB no longer accepts `debug_synchronous` parameter. " \ "Define CUB_DEBUG_SYNC instead, or silence this message with " \ "CUB_IGNORE_DEPRECATED_API.") #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG \ if (debug_synchronous) \ { \ _CubLog("%s\n", \ "CUB no longer accepts `debug_synchronous` parameter. " \ "Define CUB_DEBUG_SYNC instead."); \ } cub-2.0.1/cub/util_device.cuh000066400000000000000000000547361434614775400160670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Properties of a given CUDA device and the corresponding PTX bundle */ #pragma once #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * \addtogroup UtilMgmt * @{ */ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). */ template __host__ __device__ __forceinline__ cudaError_t AliasTemporaries( void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed { const int ALIGN_BYTES = 256; const int ALIGN_MASK = ~(ALIGN_BYTES - 1); // Compute exclusive prefix sum over allocation requests size_t allocation_offsets[ALLOCATIONS]; size_t bytes_needed = 0; for (int i = 0; i < ALLOCATIONS; ++i) { size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; allocation_offsets[i] = bytes_needed; bytes_needed += allocation_bytes; } bytes_needed += ALIGN_BYTES - 1; // Check if the caller is simply requesting the size of the storage allocation if (!d_temp_storage) { temp_storage_bytes = bytes_needed; return cudaSuccess; } // Check if enough storage provided if (temp_storage_bytes < bytes_needed) { return CubDebug(cudaErrorInvalidValue); } // Alias d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); for (int i = 0; i < ALLOCATIONS; ++i) { allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; } return cudaSuccess; } /** * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device */ template __global__ void EmptyKernel(void) { } #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Returns the current device or -1 if an error occurred. */ CUB_RUNTIME_FUNCTION inline int CurrentDevice() { int device = -1; if (CubDebug(cudaGetDevice(&device))) return -1; return device; } /** * \brief RAII helper which saves the current device and switches to the * specified device on construction and switches to the saved device on * destruction. */ struct SwitchDevice { private: int const old_device; bool const needs_reset; public: __host__ inline SwitchDevice(int new_device) : old_device(CurrentDevice()), needs_reset(old_device != new_device) { if (needs_reset) CubDebug(cudaSetDevice(new_device)); } __host__ inline ~SwitchDevice() { if (needs_reset) CubDebug(cudaSetDevice(old_device)); } }; /** * \brief Returns the number of CUDA devices available or -1 if an error * occurred. 
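 *
 * \note This uncached variant queries the CUDA runtime on every call; the
 * DeviceCount() wrapper further below caches the result on the host.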
*/ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() { int count = -1; if (CubDebug(cudaGetDeviceCount(&count))) // CUDA makes no guarantees about the state of the output parameter if // `cudaGetDeviceCount` fails; in practice, they don't, but out of // paranoia we'll reset `count` to `-1`. count = -1; return count; } /** * \brief Cache for an arbitrary value produced by a nullary function. */ template struct ValueCache { T const value; /** * \brief Call the nullary function to produce the value and construct the * cache. */ __host__ inline ValueCache() : value(Function()) {} }; // Host code, only safely usable in C++11 or newer, where thread-safe // initialization of static locals is guaranteed. This is a separate function // to avoid defining a local static in a host/device function. __host__ inline int DeviceCountCachedValue() { static ValueCache cache; return cache.value; } /** * \brief Returns the number of CUDA devices available. * * \note This function may cache the result internally. * * \note This function is thread safe. */ CUB_RUNTIME_FUNCTION inline int DeviceCount() { int result = -1; NV_IF_TARGET(NV_IS_HOST, (result = DeviceCountCachedValue();), (result = DeviceCountUncached();)); return result; } /** * \brief Per-device cache for a CUDA attribute value; the attribute is queried * and stored for each device upon construction. */ struct PerDeviceAttributeCache { struct DevicePayload { int attribute; cudaError_t error; }; // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the // `DeviceEntryInitializing` state, and then proceeds to the // `DeviceEntryReady` state. These are the only state transitions allowed; // e.g. a linear sequence of transitions. enum DeviceEntryStatus { DeviceEntryEmpty = 0, DeviceEntryInitializing, DeviceEntryReady }; struct DeviceEntry { std::atomic flag; DevicePayload payload; }; private: std::array entries_; public: /** * \brief Construct the cache. */ __host__ inline PerDeviceAttributeCache() : entries_() { assert(DeviceCount() <= CUB_MAX_DEVICES); } /** * \brief Retrieves the payload of the cached function \p f for \p device. * * \note You must pass a morally equivalent function in to every call or * this function has undefined behavior. */ template __host__ DevicePayload operator()(Invocable&& f, int device) { if (device >= DeviceCount()) return DevicePayload{0, cudaErrorInvalidDevice}; auto& entry = entries_[device]; auto& flag = entry.flag; auto& payload = entry.payload; DeviceEntryStatus old_status = DeviceEntryEmpty; // First, check for the common case of the entry being ready. if (flag.load(std::memory_order_acquire) != DeviceEntryReady) { // Assume the entry is empty and attempt to lock it so we can fill // it by trying to set the state from `DeviceEntryReady` to // `DeviceEntryInitializing`. if (flag.compare_exchange_strong(old_status, DeviceEntryInitializing, std::memory_order_acq_rel, std::memory_order_acquire)) { // We successfully set the state to `DeviceEntryInitializing`; // we have the lock and it's our job to initialize this entry // and then release it. // We don't use `CubDebug` here because we let the user code // decide whether or not errors are hard errors. payload.error = std::forward(f)(payload.attribute); if (payload.error) // Clear the global CUDA error state which may have been // set by the last call. Otherwise, errors may "leak" to // unrelated kernel launches. cudaGetLastError(); // Release the lock by setting the state to `DeviceEntryReady`. 
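                // This release store pairs with the acquire loads on `flag`,
                // so the payload written above is visible to any thread that
                // later observes `DeviceEntryReady`.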
flag.store(DeviceEntryReady, std::memory_order_release); } // If the `compare_exchange_weak` failed, then `old_status` has // been updated with the value of `flag` that it observed. else if (old_status == DeviceEntryInitializing) { // Another execution agent is initializing this entry; we need // to wait for them to finish; we'll know they're done when we // observe the entry status as `DeviceEntryReady`. do { old_status = flag.load(std::memory_order_acquire); } while (old_status != DeviceEntryReady); // FIXME: Use `atomic::wait` instead when we have access to // host-side C++20 atomics. We could use libcu++, but it only // supports atomics for SM60 and up, even if you're only using // them in host code. } } // We now know that the state of our entry is `DeviceEntryReady`, so // just return the entry's payload. return entry.payload; } }; /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). */ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) { // Instantiate `EmptyKernel` in both host and device code to ensure // it can be called. typedef void (*EmptyKernelPtr)(); EmptyKernelPtr empty_kernel = EmptyKernel; // This is necessary for unused variable warnings in host compilers. The // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015. (void)reinterpret_cast(empty_kernel); // Define a temporary macro that expands to the current target ptx version // in device code. // may provide an abstraction for this eventually. For now, // we have to keep this usage of __CUDA_ARCH__. #if defined(_NVHPC_CUDA) #define CUB_TEMP_GET_PTX __builtin_current_device_sm() #else #define CUB_TEMP_GET_PTX __CUDA_ARCH__ #endif cudaError_t result = cudaSuccess; NV_IF_TARGET( NV_IS_HOST, ( cudaFuncAttributes empty_kernel_attrs; result = cudaFuncGetAttributes(&empty_kernel_attrs, reinterpret_cast(empty_kernel)); CubDebug(result); ptx_version = empty_kernel_attrs.ptxVersion * 10; ), // NV_IS_DEVICE ( // This is necessary to ensure instantiation of EmptyKernel in device // code. The `reinterpret_cast` is necessary to suppress a // set-but-unused warnings. This is a meme now: // https://twitter.com/blelbach/status/1222391615576100864 (void)reinterpret_cast(empty_kernel); ptx_version = CUB_TEMP_GET_PTX; )); #undef CUB_TEMP_GET_PTX return result; } /** * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). */ __host__ inline cudaError_t PtxVersionUncached(int& ptx_version, int device) { SwitchDevice sd(device); (void)sd; return PtxVersionUncached(ptx_version); } template __host__ inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() { // C++11 guarantees that initialization of static locals is thread safe. static PerDeviceAttributeCache cache; return cache; } struct PtxVersionCacheTag {}; struct SmVersionCacheTag {}; /** * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). * * \note This function may cache the result internally. * * \note This function is thread safe. */ __host__ inline cudaError_t PtxVersion(int& ptx_version, int device) { auto const payload = GetPerDeviceAttributeCache()( // If this call fails, then we get the error code back in the payload, // which we check with `CubDebug` below. 
[=] (int& pv) { return PtxVersionUncached(pv, device); }, device); if (!CubDebug(payload.error)) ptx_version = payload.attribute; return payload.error; } /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). * * \note This function may cache the result internally. * * \note This function is thread safe. */ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int &ptx_version) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET( NV_IS_HOST, ( auto const device = CurrentDevice(); auto const payload = GetPerDeviceAttributeCache()( // If this call fails, then we get the error code back in the payload, // which we check with `CubDebug` below. [=](int &pv) { return PtxVersionUncached(pv, device); }, device); if (!CubDebug(payload.error)) { ptx_version = payload.attribute; } result = payload.error; ), ( // NV_IS_DEVICE: result = PtxVersionUncached(ptx_version); )); return result; } /** * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) */ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice()) { cudaError_t error = cudaSuccess; do { int major = 0, minor = 0; if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device))) break; if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device))) break; sm_version = major * 100 + minor * 10; } while (0); return error; } /** * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) * * \note This function may cache the result internally. * * \note This function is thread safe. */ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int &sm_version, int device = CurrentDevice()) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET( NV_IS_HOST, ( auto const payload = GetPerDeviceAttributeCache()( // If this call fails, then we get the error code back in // the payload, which we check with `CubDebug` below. [=](int &pv) { return SmVersionUncached(pv, device); }, device); if (!CubDebug(payload.error)) { sm_version = payload.attribute; }; result = payload.error; ), ( // NV_IS_DEVICE result = SmVersionUncached(sm_version, device); )); return result; } /** * Synchronize the specified \p stream. */ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { cudaError_t result = cudaErrorNotSupported; NV_IF_TARGET(NV_IS_HOST, (result = CubDebug(cudaStreamSynchronize(stream));), ((void)stream; result = CubDebug(cub::detail::device_synchronize());)); return result; } namespace detail { /** * Same as SyncStream, but intended for use with the debug_synchronous flags * in device algorithms. This should not be used if synchronization is required * for correctness. * * If `debug_synchronous` is false, this function will immediately return * cudaSuccess. If true, one of the following will occur: * * If synchronization is supported by the current compilation target and * settings, the sync is performed and the sync result is returned. * * If syncs are not supported then no sync is performed, but a message is logged * via _CubLog and cudaSuccess is returned. 
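 *
 * \note Whether the sync is performed is decided at compile time: defining
 * CUB_DEBUG_SYNC (see util_debug.cuh) enables CUB_DETAIL_DEBUG_ENABLE_SYNC,
 * which this helper checks.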
*/ CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) { #ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC (void)stream; return cudaSuccess; #else // CUB_DETAIL_DEBUG_ENABLE_SYNC: #define CUB_TMP_SYNC_AVAILABLE \ _CubLog("%s\n", "Synchronizing..."); \ return SyncStream(stream) #define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \ (void)stream; \ _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \ "device-side sync requires // or equivalently * * template * __global__ void ExampleKernel() * { * // Allocate shared memory for BlockScan * __shared__ volatile T buffer[4096]; * * ... * } * * ... * * // Determine SM occupancy for ExampleKernel specialized for unsigned char * int max_sm_occupancy; * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); * * // max_sm_occupancy <-- 4 on SM10 * // max_sm_occupancy <-- 8 on SM20 * // max_sm_occupancy <-- 12 on SM35 * * \endcode * */ template CUB_RUNTIME_FUNCTION inline cudaError_t MaxSmOccupancy( int& max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy int block_threads, ///< [in] Number of threads per thread block int dynamic_smem_bytes = 0) ///< [in] Dynamically allocated shared memory in bytes. Default is 0. { return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_sm_occupancy, kernel_ptr, block_threads, dynamic_smem_bytes)); } /****************************************************************************** * Policy management ******************************************************************************/ /** * Kernel dispatch configuration */ struct KernelConfig { int block_threads; int items_per_thread; int tile_size; int sm_occupancy; CUB_RUNTIME_FUNCTION __forceinline__ KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Init(KernelPtrT kernel_ptr) { block_threads = AgentPolicyT::BLOCK_THREADS; items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; tile_size = block_threads * items_per_thread; cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); return retval; } }; /// Helper for dispatching into a policy chain template struct ChainedPolicy { /// The policy for the active compiler pass using ActivePolicy = cub::detail::conditional_t<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>; /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Invoke(int ptx_version, FunctorT& op) { if (ptx_version < PTX_VERSION) { return PrevPolicyT::Invoke(ptx_version, op); } return op.template Invoke(); } }; /// Helper for dispatching into a policy chain (end-of-chain specialization) template struct ChainedPolicy { /// The policy for the active compiler pass typedef PolicyT ActivePolicy; /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op) { return op.template Invoke(); } }; /** @} */ // end group UtilMgmt CUB_NAMESPACE_END cub-2.0.1/cub/util_macro.cuh000066400000000000000000000102631434614775400157140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. 
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Common C/C++ macro utilities ******************************************************************************/ #pragma once #include "util_namespace.cuh" #include CUB_NAMESPACE_BEGIN /** * \addtogroup UtilModule * @{ */ #ifndef CUB_ALIGN #if defined(_WIN32) || defined(_WIN64) /// Align struct #define CUB_ALIGN(bytes) __declspec(align(32)) #else /// Align struct #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) #endif #endif #define CUB_PREVENT_MACRO_SUBSTITUTION template constexpr __host__ __device__ auto min CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, U &&u) -> decltype(t < u ? std::forward(t) : std::forward(u)) { return t < u ? std::forward(t) : std::forward(u); } template constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, U &&u) -> decltype(t < u ? std::forward(u) : std::forward(t)) { return t < u ? std::forward(u) : std::forward(t); } #ifndef CUB_MAX /// Select maximum(a, b) #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) #endif #ifndef CUB_MIN /// Select minimum(a, b) #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) #endif #ifndef CUB_QUOTIENT_FLOOR /// Quotient of x/y rounded down to nearest integer #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) #endif #ifndef CUB_QUOTIENT_CEILING /// Quotient of x/y rounded up to nearest integer #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) #endif #ifndef CUB_ROUND_UP_NEAREST /// x rounded up to the nearest multiple of y #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) #endif #ifndef CUB_ROUND_DOWN_NEAREST /// x rounded down to the nearest multiple of y #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) #endif #ifndef CUB_STATIC_ASSERT #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document #define CUB_CAT_(a, b) a ## b #define CUB_CAT(a, b) CUB_CAT_(a, b) #endif // DOXYGEN_SHOULD_SKIP_THIS /// Static assert #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] #endif /** @} */ // end group UtilModule CUB_NAMESPACE_END cub-2.0.1/cub/util_math.cuh000066400000000000000000000105741434614775400155510ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Define helper math functions. */ #pragma once #include #include "util_namespace.cuh" #include "util_macro.cuh" CUB_NAMESPACE_BEGIN namespace detail { template using is_integral_or_enum = std::integral_constant::value || std::is_enum::value>; __host__ __device__ __forceinline__ constexpr std::size_t VshmemSize(std::size_t max_shmem, std::size_t shmem_per_block, std::size_t num_blocks) { return shmem_per_block > max_shmem ? shmem_per_block * num_blocks : 0; } } /** * Divide n by d, round up if any remainder, and return the result. * * Effectively performs `(n + d - 1) / d`, but is robust against the case where * `(n + d - 1)` would overflow. 
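 *
 * \par Snippet (illustrative)
 * \code
 * // e.g. number of tiles needed to cover 10 items with tiles of 4:
 * int num_tiles = cub::DivideAndRoundUp(10, 4);   // num_tiles == 3
 * \endcode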
*/ template __host__ __device__ __forceinline__ constexpr NumeratorT DivideAndRoundUp(NumeratorT n, DenominatorT d) { static_assert(cub::detail::is_integral_or_enum::value && cub::detail::is_integral_or_enum::value, "DivideAndRoundUp is only intended for integral types."); // Static cast to undo integral promotion. return static_cast(n / d + (n % d != 0 ? 1 : 0)); } constexpr __device__ __host__ int Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) { return (cub::min)(nominal_4b_items_per_thread, (cub::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); } template constexpr __device__ __host__ int Nominal4BItemsToItems(int nominal_4b_items_per_thread) { return (cub::min)(nominal_4b_items_per_thread, (cub::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); } template constexpr __device__ __host__ int Nominal8BItemsToItems(int nominal_8b_items_per_thread) { return sizeof(ItemT) <= 8u ? nominal_8b_items_per_thread : (cub::min)(nominal_8b_items_per_thread, (cub::max)(1, ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) / static_cast(sizeof(ItemT)))); } /** * \brief Computes the midpoint of the integers * * Extra operation is performed in order to prevent overflow. * * \return Half the sum of \p begin and \p end */ template constexpr __device__ __host__ T MidPoint(T begin, T end) { return begin + (end - begin) / 2; } CUB_NAMESPACE_END cub-2.0.1/cub/util_namespace.cuh000066400000000000000000000131061434614775400165460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file util_namespace.cuh * \brief Utilities that allow `cub::` to be placed inside an * application-specific namespace. 
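 *
 * For example (an illustrative sketch): building with
 * -DCUB_WRAPPED_NAMESPACE=my_proj nests the library in `my_proj::cub::`, and
 * `CUB_NS_QUALIFIER` then expands to `::my_proj::cub`.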
*/ #pragma once // This is not used by this file; this is a hack so that we can detect the // CUB version from Thrust on older versions of CUB that did not have // version.cuh. #include "version.cuh" // Prior to 1.13.1, only the PREFIX/POSTFIX macros were used. Notify users // that they must now define the qualifier macro, too. #if (defined(CUB_NS_PREFIX) || defined(CUB_NS_POSTFIX)) && !defined(CUB_NS_QUALIFIER) #error CUB requires a definition of CUB_NS_QUALIFIER when CUB_NS_PREFIX/POSTFIX are defined. #endif /** * \def THRUST_CUB_WRAPPED_NAMESPACE * If defined, this value will be used as the name of a namespace that wraps the * `thrust::` and `cub::` namespaces. * This macro should not be used with any other CUB namespace macros. */ #ifdef THRUST_CUB_WRAPPED_NAMESPACE #define CUB_WRAPPED_NAMESPACE THRUST_CUB_WRAPPED_NAMESPACE #endif /** * \def CUB_WRAPPED_NAMESPACE * If defined, this value will be used as the name of a namespace that wraps the * `cub::` namespace. * If THRUST_CUB_WRAPPED_NAMESPACE is set, this will inherit that macro's value. * This macro should not be used with any other CUB namespace macros. */ #ifdef CUB_WRAPPED_NAMESPACE #define CUB_NS_PREFIX \ namespace CUB_WRAPPED_NAMESPACE \ { #define CUB_NS_POSTFIX } #define CUB_NS_QUALIFIER ::CUB_WRAPPED_NAMESPACE::cub #endif /** * \def CUB_NS_PREFIX * This macro is inserted prior to all `namespace cub { ... }` blocks. It is * derived from CUB_WRAPPED_NAMESPACE, if set, and will be empty otherwise. * It may be defined by users, in which case CUB_NS_PREFIX, * CUB_NS_POSTFIX, and CUB_NS_QUALIFIER must all be set consistently. */ #ifndef CUB_NS_PREFIX #define CUB_NS_PREFIX #endif /** * \def CUB_NS_POSTFIX * This macro is inserted following the closing braces of all * `namespace cub { ... }` block. It is defined appropriately when * CUB_WRAPPED_NAMESPACE is set, and will be empty otherwise. It may be * defined by users, in which case CUB_NS_PREFIX, CUB_NS_POSTFIX, and * CUB_NS_QUALIFIER must all be set consistently. */ #ifndef CUB_NS_POSTFIX #define CUB_NS_POSTFIX #endif /** * \def CUB_NS_QUALIFIER * This macro is used to qualify members of cub:: when accessing them from * outside of their namespace. By default, this is just `::cub`, and will be * set appropriately when CUB_WRAPPED_NAMESPACE is defined. This macro may be * defined by users, in which case CUB_NS_PREFIX, CUB_NS_POSTFIX, and * CUB_NS_QUALIFIER must all be set consistently. */ #ifndef CUB_NS_QUALIFIER #define CUB_NS_QUALIFIER ::cub #endif /** * \def CUB_NAMESPACE_BEGIN * This macro is used to open a `cub::` namespace block, along with any * enclosing namespaces requested by CUB_WRAPPED_NAMESPACE, etc. * This macro is defined by CUB and may not be overridden. */ #define CUB_NAMESPACE_BEGIN \ CUB_NS_PREFIX \ namespace cub \ { /** * \def CUB_NAMESPACE_END * This macro is used to close a `cub::` namespace block, along with any * enclosing namespaces requested by CUB_WRAPPED_NAMESPACE, etc. * This macro is defined by CUB and may not be overridden. */ #define CUB_NAMESPACE_END \ } /* end namespace cub */ \ CUB_NS_POSTFIX // Declare these namespaces here for the purpose of Doxygenating them CUB_NS_PREFIX /*! \namespace cub * \brief \p cub is the top-level namespace which contains all CUB * functions and types. */ namespace cub { } CUB_NS_POSTFIX cub-2.0.1/cub/util_ptx.cuh000066400000000000000000000551171434614775400154350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. 
All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * PTX intrinsics */ #pragma once #include "util_type.cuh" #include "util_arch.cuh" #include "util_namespace.cuh" #include "util_debug.cuh" CUB_NAMESPACE_BEGIN /** * \addtogroup UtilPtx * @{ */ /****************************************************************************** * PTX helper macros ******************************************************************************/ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Register modifier for pointer-types (for inlining PTX assembly) */ #if defined(_WIN64) || defined(__LP64__) #define __CUB_LP64__ 1 // 64-bit register modifier for inlined asm #define _CUB_ASM_PTR_ "l" #define _CUB_ASM_PTR_SIZE_ "u64" #else #define __CUB_LP64__ 0 // 32-bit register modifier for inlined asm #define _CUB_ASM_PTR_ "r" #define _CUB_ASM_PTR_SIZE_ "u32" #endif #endif // DOXYGEN_SHOULD_SKIP_THIS /****************************************************************************** * Inlined PTX intrinsics ******************************************************************************/ /** * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. */ __device__ __forceinline__ unsigned int SHR_ADD( unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); return ret; } /** * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. */ __device__ __forceinline__ unsigned int SHL_ADD( unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); return ret; } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Bitfield-extract. 
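 * (Internal dispatch: the Int2Type overload below handles 1-, 2-, and 4-byte
 * sources via the bfe.u32 instruction; a separate 8-byte overload further down
 * falls back to a shift-and-mask.)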
*/ template __device__ __forceinline__ unsigned int BFE( UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type /*byte_len*/) { unsigned int bits; asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); return bits; } /** * Bitfield-extract for 64-bit types. */ template __device__ __forceinline__ unsigned int BFE( UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type<8> /*byte_len*/) { const unsigned long long MASK = (1ull << num_bits) - 1; return (source >> bit_start) & MASK; } #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. */ template __device__ __forceinline__ unsigned int BFE( UnsignedBits source, unsigned int bit_start, unsigned int num_bits) { return BFE(source, bit_start, num_bits, Int2Type()); } /** * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. */ __device__ __forceinline__ void BFI( unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits) { asm ("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); } /** * \brief Three-operand add. Returns \p x + \p y + \p z. */ __device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) { asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); return x; } /** * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. * * \par * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} * * \par Snippet * The code snippet below illustrates byte-permute. * \par * \code * #include * * __global__ void ExampleKernel(...) * { * int a = 0x03020100; * int b = 0x07060504; * int index = 0x00007531; * * int selected = PRMT(a, b, index); // 0x07050301 * * \endcode * */ __device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) { int ret; asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); return ret; } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Sync-threads barrier. 
*/ __device__ __forceinline__ void BAR(int count) { asm volatile("bar.sync 1, %0;" : : "r"(count)); } /** * CTA barrier */ __device__ __forceinline__ void CTA_SYNC() { __syncthreads(); } /** * CTA barrier with predicate */ __device__ __forceinline__ int CTA_SYNC_AND(int p) { return __syncthreads_and(p); } /** * CTA barrier with predicate */ __device__ __forceinline__ int CTA_SYNC_OR(int p) { return __syncthreads_or(p); } /** * Warp barrier */ __device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) { #ifdef CUB_USE_COOPERATIVE_GROUPS __syncwarp(member_mask); #endif } /** * Warp any */ __device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) { #ifdef CUB_USE_COOPERATIVE_GROUPS return __any_sync(member_mask, predicate); #else return ::__any(predicate); #endif } /** * Warp any */ __device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) { #ifdef CUB_USE_COOPERATIVE_GROUPS return __all_sync(member_mask, predicate); #else return ::__all(predicate); #endif } /** * Warp ballot */ __device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) { #ifdef CUB_USE_COOPERATIVE_GROUPS return __ballot_sync(member_mask, predicate); #else return __ballot(predicate); #endif } /** * Warp synchronous shfl_up */ __device__ __forceinline__ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) { #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); #else asm volatile("shfl.up.b32 %0, %1, %2, %3;" : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); #endif return word; } /** * Warp synchronous shfl_down */ __device__ __forceinline__ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) { #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); #else asm volatile("shfl.down.b32 %0, %1, %2, %3;" : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); #endif return word; } /** * Warp synchronous shfl_idx */ __device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) { #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); #else asm volatile("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags)); #endif return word; } /** * Warp synchronous shfl_idx */ __device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) { #ifdef CUB_USE_COOPERATIVE_GROUPS return __shfl_sync(member_mask, word, src_lane); #else return __shfl(word, src_lane); #endif } /** * Floating point multiply. (Mantissa LSB rounds towards zero.) */ __device__ __forceinline__ float FMUL_RZ(float a, float b) { float d; asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); return d; } /** * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
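 *
 * Effectively the same operation as the CUDA intrinsic __fmaf_rz(a, b, c):
 * a single fused multiply-add whose result is rounded toward zero rather
 * than to nearest-even.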
*/ __device__ __forceinline__ float FFMA_RZ(float a, float b, float c) { float d; asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); return d; } #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Terminates the calling thread */ __device__ __forceinline__ void ThreadExit() { asm volatile("exit;"); } /** * \brief Abort execution and generate an interrupt to the host CPU */ __device__ __forceinline__ void ThreadTrap() { asm volatile("trap;"); } /** * \brief Returns the row-major linear thread identifier for a multidimensional thread block */ __device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) { return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + threadIdx.x; } /** * \brief Returns the warp lane ID of the calling thread */ __device__ __forceinline__ unsigned int LaneId() { unsigned int ret; asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); return ret; } /** * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. */ __device__ __forceinline__ unsigned int WarpId() { unsigned int ret; asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); return ret; } /** * @brief Returns the warp mask for a warp of @p LOGICAL_WARP_THREADS threads * * @par * If the number of threads assigned to the virtual warp is not a power of two, * it's assumed that only one virtual warp exists. * * @tparam LOGICAL_WARP_THREADS [optional] The number of threads per * "logical" warp (may be less than the number of * hardware warp threads). * @param warp_id Id of virtual warp within architectural warp */ template __host__ __device__ __forceinline__ unsigned int WarpMask(unsigned int warp_id) { constexpr bool is_pow_of_two = PowerOfTwo::VALUE; constexpr bool is_arch_warp = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); unsigned int member_mask = 0xFFFFFFFFu >> (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS); if (is_pow_of_two && !is_arch_warp) { member_mask <<= warp_id * LOGICAL_WARP_THREADS; } return member_mask; } /** * \brief Returns the warp lane mask of all lanes less than the calling thread */ __device__ __forceinline__ unsigned int LaneMaskLt() { unsigned int ret; asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); return ret; } /** * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread */ __device__ __forceinline__ unsigned int LaneMaskLe() { unsigned int ret; asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); return ret; } /** * \brief Returns the warp lane mask of all lanes greater than the calling thread */ __device__ __forceinline__ unsigned int LaneMaskGt() { unsigned int ret; asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); return ret; } /** * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread */ __device__ __forceinline__ unsigned int LaneMaskGe() { unsigned int ret; asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); return ret; } /** @} */ // end group UtilPtx /** * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) * \ingroup WarpModule * * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. 
* \tparam T [inferred] The input/output element type * * \par * - Available only for SM3.0 or newer * * \par Snippet * The code snippet below illustrates each thread obtaining a \p double value from the * predecessor of its predecessor. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Obtain one input item per thread * double thread_data = ... * * // Obtain item from two ranks below * double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); * * \endcode * \par * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. * */ template < int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp typename T> __device__ __forceinline__ T ShuffleUp( T input, ///< [in] The value to broadcast int src_offset, ///< [in] The relative down-offset of the peer to read from int first_thread, ///< [in] Index of first lane in logical warp (typically 0) unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 }; typedef typename UnitWord::ShuffleWord ShuffleWord; const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; ShuffleWord *output_alias = reinterpret_cast(&output); ShuffleWord *input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask); output_alias[0] = shuffle_word; #pragma unroll for (int WORD = 1; WORD < WORDS; ++WORD) { shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask); output_alias[WORD] = shuffle_word; } return output; } /** * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) * \ingroup WarpModule * * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. * \tparam T [inferred] The input/output element type * * \par * - Available only for SM3.0 or newer * * \par Snippet * The code snippet below illustrates each thread obtaining a \p double value from the * successor of its successor. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Obtain one input item per thread * double thread_data = ... * * // Obtain item from two ranks below * double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); * * \endcode * \par * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. 
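 * \par
 * - Note that types wider than 32 bits are exchanged as a sequence of
 *   32-bit (or smaller) \p ShuffleWord shuffles, as in the word-by-word
 *   loop of the implementation below.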
* */ template < int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp typename T> __device__ __forceinline__ T ShuffleDown( T input, ///< [in] The value to broadcast int src_offset, ///< [in] The relative up-offset of the peer to read from int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp) unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 }; typedef typename UnitWord::ShuffleWord ShuffleWord; const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; ShuffleWord *output_alias = reinterpret_cast(&output); ShuffleWord *input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask); output_alias[0] = shuffle_word; #pragma unroll for (int WORD = 1; WORD < WORDS; ++WORD) { shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask); output_alias[WORD] = shuffle_word; } return output; } /** * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) * * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. * \tparam T [inferred] The input/output element type * * \ingroup WarpModule * * \par * - Available only for SM3.0 or newer * * \par Snippet * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. * * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Obtain one input item per thread * double thread_data = ... * * // Obtain item from thread 0 * double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); * * \endcode * \par * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. * */ template < int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp typename T> __device__ __forceinline__ T ShuffleIndex( T input, ///< [in] The value to broadcast int src_lane, ///< [in] Which warp lane is to do the broadcasting unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) }; typedef typename UnitWord::ShuffleWord ShuffleWord; const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; ShuffleWord *output_alias = reinterpret_cast(&output); ShuffleWord *input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], src_lane, SHFL_C, member_mask); output_alias[0] = shuffle_word; #pragma unroll for (int WORD = 1; WORD < WORDS; ++WORD) { shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], src_lane, SHFL_C, member_mask); output_alias[WORD] = shuffle_word; } return output; } /** * Compute a 32b mask of threads having the same least-significant * LABEL_BITS of \p label as the calling thread. 
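 *
 * A worked illustration (label values chosen here for exposition): with
 * LABEL_BITS = 2 and the labels {0, 1, 2, 3, 0, 1, 2, 3, ...} assigned to
 * lanes 0..31 of a warp, lane 0 receives the mask 0x11111111 (all lanes whose
 * two least-significant label bits are 00), lane 1 receives 0x22222222, and
 * so on.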
*/ template inline __device__ unsigned int MatchAny(unsigned int label) { unsigned int retval; // Extract masks of common threads for each bit #pragma unroll for (int BIT = 0; BIT < LABEL_BITS; ++BIT) { unsigned int mask; unsigned int current_bit = 1 << BIT; asm ("{\n" " .reg .pred p;\n" " and.b32 %0, %1, %2;" " setp.eq.u32 p, %0, %2;\n" #ifdef CUB_USE_COOPERATIVE_GROUPS " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" #else " vote.ballot.b32 %0, p;\n" #endif " @!p not.b32 %0, %0;\n" "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); // Remove peers who differ retval = (BIT == 0) ? mask : retval & mask; } return retval; // // VOLTA match // unsigned int retval; // asm ("{\n" // " match.any.sync.b32 %0, %1, 0xffffffff;\n" // "}\n" : "=r"(retval) : "r"(label)); // return retval; } CUB_NAMESPACE_END cub-2.0.1/cub/util_type.cuh000066400000000000000000001255241434614775400156030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Common type manipulation (metaprogramming) utilities */ #pragma once #include #include #include #include #include #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA #include #endif #if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !_NVHPC_CUDA && \ !defined(CUB_DISABLE_BF16_SUPPORT) #include #endif #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * \addtogroup UtilModule * @{ */ /****************************************************************************** * Conditional types ******************************************************************************/ namespace detail { template using conditional_t = typename std::conditional::type; template using value_t = typename std::iterator_traits::value_type; /** * The output value type * type = (if IteratorT's value type is void) ? * ... then the FallbackT, * ... 
else the IteratorT's value type */ template using non_void_value_t = cub::detail::conditional_t, void>::value, FallbackT, value_t>; } // namespace detail /** * \brief Type selection (IF ? ThenType : ElseType) * * \deprecated [Since 1.16.0] The cub::If APIs are deprecated. * Use cub::detail::conditional_t instead. */ template struct CUB_DEPRECATED If { using Type = cub::detail::conditional_t; }; /****************************************************************************** * Type equality ******************************************************************************/ /** * \brief Type equality test * * \deprecated [Since 1.16.0] The cub::Equals APIs are deprecated. * Use std::is_same instead. */ template struct CUB_DEPRECATED Equals { static constexpr int VALUE = std::is_same::value ? 1 : 0; static constexpr int NEGATE = VALUE ? 0 : 1; }; /****************************************************************************** * Static math ******************************************************************************/ /** * \brief Statically determine log2(N), rounded up. * * For example: * Log2<8>::VALUE // 3 * Log2<3>::VALUE // 2 */ template struct Log2 { /// Static logarithm value enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case }; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template struct Log2 { enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case COUNT : COUNT - 1 }; }; #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Statically determine if N is a power-of-two */ template struct PowerOfTwo { enum { VALUE = ((N & (N - 1)) == 0) }; }; /****************************************************************************** * Pointer vs. iterator detection ******************************************************************************/ /** * \brief Pointer vs. iterator * * \deprecated [Since 1.16.0] The cub::IsPointer APIs are deprecated. * Use std::is_pointer instead. */ template struct CUB_DEPRECATED IsPointer { static constexpr int VALUE = std::is_pointer::value; }; /****************************************************************************** * Qualifier detection ******************************************************************************/ /** * \brief Volatile modifier test * * \deprecated [Since 1.16.0] The cub::IsVolatile APIs are deprecated. * Use std::is_volatile instead. */ template struct CUB_DEPRECATED IsVolatile { static constexpr int VALUE = std::is_volatile::value; }; /****************************************************************************** * Qualifier removal ******************************************************************************/ /** * \brief Removes \p const and \p volatile qualifiers from type \p Tp. * * \deprecated [Since 1.16.0] The cub::RemoveQualifiers APIs are deprecated. * Use std::remove_cv instead. 
* * For example: * typename RemoveQualifiers::Type // int; */ template struct CUB_DEPRECATED RemoveQualifiers { using Type = typename std::remove_cv::type; }; /****************************************************************************** * Marker types ******************************************************************************/ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * \brief A simple "NULL" marker type */ struct NullType { using value_type = NullType; template __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; } __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; } __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; } }; /** * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) */ template struct Int2Type { enum {VALUE = A}; }; /** * \brief Allows algorithms that take a value as input to take a future value that is not computed yet at launch time. * * Note that it is user's responsibility to ensure that the result will be ready before use via external synchronization * or stream-ordering dependencies. * * \code * int *d_intermediate_result; * allocator.DeviceAllocate((void **)&d_intermediate_result, sizeof(int)); * compute_intermediate_result<<>>( * d_intermediate_result, // output * arg1, // input * arg2); // input * cub::FutureValue init_value(d_intermediate_result); * cub::DeviceScan::ExclusiveScan( * d_temp_storage, * temp_storage_bytes, * d_in, * d_out, * cub::Sum(), * init_value, * num_items); * allocator.DeviceFree(d_intermediate_result); * \endcode */ template struct FutureValue { using value_type = T; using iterator_type = IterT; explicit __host__ __device__ __forceinline__ FutureValue(IterT iter):m_iter(iter) {} __host__ __device__ __forceinline__ operator T() { return *m_iter; } private: IterT m_iter; }; namespace detail { /** * \brief Allows algorithms to instantiate a single kernel to support both immediate value and future value. 
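 *
 * \par
 * A minimal sketch of the two ways a dispatch layer can construct this
 * wrapper (variable names here are illustrative only):
 * \code
 * int *d_intermediate_result = ...;
 * cub::detail::InputValue<int> immediate(42);
 * cub::detail::InputValue<int> deferred(
 *     cub::FutureValue<int>(d_intermediate_result));
 * // Both convert to int inside the kernel; the deferred one dereferences
 * // d_intermediate_result at the point of use.
 * \endcode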
*/ template struct InputValue { using value_type = T; using iterator_type = IterT; __host__ __device__ __forceinline__ operator T() { if (m_is_future) { return m_future_value; } return m_immediate_value; } explicit __host__ __device__ __forceinline__ InputValue(T immediate_value): m_is_future(false), m_immediate_value(immediate_value) {} explicit __host__ __device__ __forceinline__ InputValue(FutureValue future_value): m_is_future(true), m_future_value(future_value) {} __host__ __device__ __forceinline__ InputValue(const InputValue &other): m_is_future(other.m_is_future) { if (m_is_future) { m_future_value = other.m_future_value; } else { detail::uninitialized_copy(&m_immediate_value, other.m_immediate_value); } } private: bool m_is_future; union { FutureValue m_future_value; T m_immediate_value; }; }; } // namespace detail /****************************************************************************** * Size and alignment ******************************************************************************/ /// Structure alignment template struct AlignBytes { struct Pad { T val; char byte; }; enum { /// The "true CUDA" alignment of T in bytes ALIGN_BYTES = sizeof(Pad) - sizeof(T) }; /// The "truly aligned" type typedef T Type; }; // Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree // with device C++ compilers (EDG) on types passed as template parameters through // kernel functions #define __CUB_ALIGN_BYTES(t, b) \ template <> struct AlignBytes \ { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; }; __CUB_ALIGN_BYTES(short4, 8) __CUB_ALIGN_BYTES(ushort4, 8) __CUB_ALIGN_BYTES(int2, 8) __CUB_ALIGN_BYTES(uint2, 8) __CUB_ALIGN_BYTES(long long, 8) __CUB_ALIGN_BYTES(unsigned long long, 8) __CUB_ALIGN_BYTES(float2, 8) __CUB_ALIGN_BYTES(double, 8) #ifdef _WIN32 __CUB_ALIGN_BYTES(long2, 8) __CUB_ALIGN_BYTES(ulong2, 8) #else __CUB_ALIGN_BYTES(long2, 16) __CUB_ALIGN_BYTES(ulong2, 16) #endif __CUB_ALIGN_BYTES(int4, 16) __CUB_ALIGN_BYTES(uint4, 16) __CUB_ALIGN_BYTES(float4, 16) __CUB_ALIGN_BYTES(long4, 16) __CUB_ALIGN_BYTES(ulong4, 16) __CUB_ALIGN_BYTES(longlong2, 16) __CUB_ALIGN_BYTES(ulonglong2, 16) __CUB_ALIGN_BYTES(double2, 16) __CUB_ALIGN_BYTES(longlong4, 16) __CUB_ALIGN_BYTES(ulonglong4, 16) __CUB_ALIGN_BYTES(double4, 16) // clang-format off template struct AlignBytes : AlignBytes {}; template struct AlignBytes : AlignBytes {}; template struct AlignBytes : AlignBytes {}; // clang-format on /// Unit-words of data movement template struct UnitWord { enum { ALIGN_BYTES = AlignBytes::ALIGN_BYTES }; template struct IsMultiple { enum { UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (int(ALIGN_BYTES) % int(UNIT_ALIGN_BYTES) == 0) }; }; /// Biggest shuffle word that T is a whole multiple of and is not larger than /// the alignment of T using ShuffleWord = cub::detail::conditional_t< IsMultiple::IS_MULTIPLE, unsigned int, cub::detail::conditional_t::IS_MULTIPLE, unsigned short, unsigned char>>; /// Biggest volatile word that T is a whole multiple of and is not larger than /// the alignment of T using VolatileWord = cub::detail::conditional_t::IS_MULTIPLE, unsigned long long, ShuffleWord>; /// Biggest memory-access word that T is a whole multiple of and is not larger /// than the alignment of T using DeviceWord = cub::detail::conditional_t::IS_MULTIPLE, ulonglong2, VolatileWord>; /// Biggest texture reference word that T is a whole multiple of and is not /// larger than the alignment of T using TextureWord = 
cub::detail::conditional_t< IsMultiple::IS_MULTIPLE, uint4, cub::detail::conditional_t::IS_MULTIPLE, uint2, ShuffleWord>>; }; // float2 specialization workaround (for SM10-SM13) template <> struct UnitWord { typedef int ShuffleWord; typedef unsigned long long VolatileWord; typedef unsigned long long DeviceWord; typedef float2 TextureWord; }; // float4 specialization workaround (for SM10-SM13) template <> struct UnitWord { typedef int ShuffleWord; typedef unsigned long long VolatileWord; typedef ulonglong2 DeviceWord; typedef float4 TextureWord; }; // char2 specialization workaround (for SM10-SM13) template <> struct UnitWord { typedef unsigned short ShuffleWord; typedef unsigned short VolatileWord; typedef unsigned short DeviceWord; typedef unsigned short TextureWord; }; // clang-format off template struct UnitWord : UnitWord {}; template struct UnitWord : UnitWord {}; template struct UnitWord : UnitWord {}; // clang-format on /****************************************************************************** * Vector type inference utilities. ******************************************************************************/ /** * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. */ template struct CubVector; enum { /// The maximum number of elements in CUDA vector types MAX_VEC_ELEMENTS = 4, }; /** * Generic vector-1 type */ template struct CubVector { T x; typedef T BaseType; typedef CubVector Type; }; /** * Generic vector-2 type */ template struct CubVector { T x; T y; typedef T BaseType; typedef CubVector Type; }; /** * Generic vector-3 type */ template struct CubVector { T x; T y; T z; typedef T BaseType; typedef CubVector Type; }; /** * Generic vector-4 type */ template struct CubVector { T x; T y; T z; T w; typedef T BaseType; typedef CubVector Type; }; /** * Macro for expanding partially-specialized built-in vector types */ #define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ \ template<> struct CubVector : short_type##1 \ { \ typedef base_type BaseType; \ typedef short_type##1 Type; \ __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ CubVector retval; \ retval.x = x + other.x; \ return retval; \ } \ __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ CubVector retval; \ retval.x = x - other.x; \ return retval; \ } \ }; \ \ template<> struct CubVector : short_type##2 \ { \ typedef base_type BaseType; \ typedef short_type##2 Type; \ __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ CubVector retval; \ retval.x = x + other.x; \ retval.y = y + other.y; \ return retval; \ } \ __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ CubVector retval; \ retval.x = x - other.x; \ retval.y = y - other.y; \ return retval; \ } \ }; \ \ template<> struct CubVector : short_type##3 \ { \ typedef base_type BaseType; \ typedef short_type##3 Type; \ __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ CubVector retval; \ retval.x = x + other.x; \ retval.y = y + other.y; \ retval.z = z + other.z; \ return retval; \ } \ __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ CubVector retval; \ retval.x = x - other.x; \ retval.y = y - other.y; \ retval.z = z - other.z; \ return retval; \ } \ }; \ \ template<> 
struct CubVector : short_type##4 \ { \ typedef base_type BaseType; \ typedef short_type##4 Type; \ __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ CubVector retval; \ retval.x = x + other.x; \ retval.y = y + other.y; \ retval.z = z + other.z; \ retval.w = w + other.w; \ return retval; \ } \ __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ CubVector retval; \ retval.x = x - other.x; \ retval.y = y - other.y; \ retval.z = z - other.z; \ retval.w = w - other.w; \ return retval; \ } \ }; // Expand CUDA vector types for built-in primitives // clang-format off CUB_DEFINE_VECTOR_TYPE(char, char) CUB_DEFINE_VECTOR_TYPE(signed char, char) CUB_DEFINE_VECTOR_TYPE(short, short) CUB_DEFINE_VECTOR_TYPE(int, int) CUB_DEFINE_VECTOR_TYPE(long, long) CUB_DEFINE_VECTOR_TYPE(long long, longlong) CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) CUB_DEFINE_VECTOR_TYPE(float, float) CUB_DEFINE_VECTOR_TYPE(double, double) CUB_DEFINE_VECTOR_TYPE(bool, uchar) // clang-format on // Undefine macros #undef CUB_DEFINE_VECTOR_TYPE /****************************************************************************** * Wrapper types ******************************************************************************/ /** * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions */ template struct Uninitialized { /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T typedef typename UnitWord::DeviceWord DeviceWord; static constexpr std::size_t DATA_SIZE = sizeof(T); static constexpr std::size_t WORD_SIZE = sizeof(DeviceWord); static constexpr std::size_t WORDS = DATA_SIZE / WORD_SIZE; /// Backing storage DeviceWord storage[WORDS]; /// Alias __host__ __device__ __forceinline__ T& Alias() { return reinterpret_cast(*this); } }; /** * \brief A key identifier paired with a corresponding value */ template < typename _Key, typename _Value #if defined(_WIN32) && !defined(_WIN64) , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) #endif // #if defined(_WIN32) && !defined(_WIN64) > struct KeyValuePair { typedef _Key Key; ///< Key data type typedef _Value Value; ///< Value data type Key key; ///< Item key Value value; ///< Item value /// Constructor __host__ __device__ __forceinline__ KeyValuePair() {} /// Constructor __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} /// Inequality operator __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) { return (value != b.value) || (key != b.key); } }; #if defined(_WIN32) && !defined(_WIN64) /** * Win32 won't do 16B alignment. 
This can present two problems for * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: * 1) If a smaller-aligned item were to be listed first, the host compiler places the * should-be-16B item at too early an offset (and disagrees with device compiler) * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size * of the struct wrong (and disagrees with device compiler) * * So we put the larger-should-be-aligned item first, and explicitly pad the * end of the struct */ /// Smaller key specialization template struct KeyValuePair { typedef K Key; typedef V Value; typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; Value value; // Value has larger would-be alignment and goes first Key key; Pad pad; /// Constructor __host__ __device__ __forceinline__ KeyValuePair() {} /// Constructor __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} /// Inequality operator __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) { return (value != b.value) || (key != b.key); } }; /// Smaller value specialization template struct KeyValuePair { typedef K Key; typedef V Value; typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; Key key; // Key has larger would-be alignment and goes first Value value; Pad pad; /// Constructor __host__ __device__ __forceinline__ KeyValuePair() {} /// Constructor __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} /// Inequality operator __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) { return (value != b.value) || (key != b.key); } }; #endif // #if defined(_WIN32) && !defined(_WIN64) /** * \brief A wrapper for passing simple static arrays as kernel parameters */ template struct ArrayWrapper { /// Statically-sized array of type \p T T array[COUNT]; /// Constructor __host__ __device__ __forceinline__ ArrayWrapper() {} }; /** * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. * * Many multi-pass computations require a pair of "ping-pong" storage * buffers (e.g., one for reading from and the other for writing to, and then * vice-versa for the subsequent pass). This structure wraps a set of device * buffers and a "selector" member to track which is "current". 
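 *
 * \par Snippet
 * A minimal sketch of the typical ping-pong usage with a device-wide radix
 * sort (buffer and size names here are illustrative only):
 * \code
 * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
 * cub::DeviceRadixSort::SortKeys(
 *     d_temp_storage, temp_storage_bytes, d_keys, num_items);
 * int *d_sorted_keys = d_keys.Current();  // buffer now holding the sorted keys
 * \endcode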
*/ template struct DoubleBuffer { /// Pair of device buffer pointers T *d_buffers[2]; /// Selector into \p d_buffers (i.e., the active/valid buffer) int selector; /// \brief Constructor __host__ __device__ __forceinline__ DoubleBuffer() { selector = 0; d_buffers[0] = NULL; d_buffers[1] = NULL; } /// \brief Constructor __host__ __device__ __forceinline__ DoubleBuffer( T *d_current, ///< The currently valid buffer T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current { selector = 0; d_buffers[0] = d_current; d_buffers[1] = d_alternate; } /// \brief Return pointer to the currently valid buffer __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } /// \brief Return pointer to the currently invalid buffer __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; } }; /****************************************************************************** * Typedef-detection ******************************************************************************/ /** * \brief Defines a structure \p detector_name that is templated on type \p T. The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name */ #define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ template \ struct detector_name \ { \ template \ static char& test(typename C::nested_type_name*); \ template \ static int& test(...); \ enum \ { \ VALUE = sizeof(test(0)) < sizeof(int) \ }; \ }; /****************************************************************************** * Simple enable-if (similar to Boost) ******************************************************************************/ /** * \brief Simple enable-if (similar to Boost) * * \deprecated [Since 1.16.0] The cub::If APIs are deprecated. * Use std::enable_if instead. */ template struct CUB_DEPRECATED EnableIf { using Type = typename std::enable_if::type; }; /****************************************************************************** * Typedef-detection ******************************************************************************/ /** * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or bool operator()(const T& a, const T&b, unsigned int idx) */ template struct BinaryOpHasIdxParam { private: /* template struct SFINAE1 {}; template struct SFINAE2 {}; template struct SFINAE3 {}; template struct SFINAE4 {}; */ template struct SFINAE5 {}; template struct SFINAE6 {}; template struct SFINAE7 {}; template struct SFINAE8 {}; /* template static char Test(SFINAE1 *); template static char Test(SFINAE2 *); template static char Test(SFINAE3 *); template static char Test(SFINAE4 *); */ template __host__ __device__ static char Test(SFINAE5 *); template __host__ __device__ static char Test(SFINAE6 *); template __host__ __device__ static char Test(SFINAE7 *); template __host__ __device__ static char Test(SFINAE8 *); template static int Test(...); public: /// Whether the functor BinaryOp has a third unsigned int index param static const bool HAS_PARAM = sizeof(Test(NULL)) == sizeof(char); }; /****************************************************************************** * Simple type traits utilities. 
* * For example: * Traits::CATEGORY // SIGNED_INTEGER * Traits::NULL_TYPE // true * Traits::CATEGORY // NOT_A_NUMBER * Traits::PRIMITIVE; // false * ******************************************************************************/ /** * \brief Basic type traits categories */ enum Category { NOT_A_NUMBER, SIGNED_INTEGER, UNSIGNED_INTEGER, FLOATING_POINT }; /** * \brief Basic type traits */ template struct BaseTraits { /// Category static const Category CATEGORY = _CATEGORY; enum { PRIMITIVE = _PRIMITIVE, NULL_TYPE = _NULL_TYPE, }; }; /** * Basic type traits (unsigned primitive specialization) */ template struct BaseTraits { typedef _UnsignedBits UnsignedBits; static const Category CATEGORY = UNSIGNED_INTEGER; static const UnsignedBits LOWEST_KEY = UnsignedBits(0); static const UnsignedBits MAX_KEY = UnsignedBits(-1); enum { PRIMITIVE = true, NULL_TYPE = false, }; static __host__ __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) { return key; } static __host__ __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) { return key; } static __host__ __device__ __forceinline__ T Max() { UnsignedBits retval_bits = MAX_KEY; T retval; memcpy(&retval, &retval_bits, sizeof(T)); return retval; } static __host__ __device__ __forceinline__ T Lowest() { UnsignedBits retval_bits = LOWEST_KEY; T retval; memcpy(&retval, &retval_bits, sizeof(T)); return retval; } }; /** * Basic type traits (signed primitive specialization) */ template struct BaseTraits { typedef _UnsignedBits UnsignedBits; static const Category CATEGORY = SIGNED_INTEGER; static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); static const UnsignedBits LOWEST_KEY = HIGH_BIT; static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; enum { PRIMITIVE = true, NULL_TYPE = false, }; static __host__ __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) { return key ^ HIGH_BIT; }; static __host__ __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) { return key ^ HIGH_BIT; }; static __host__ __device__ __forceinline__ T Max() { UnsignedBits retval = MAX_KEY; return reinterpret_cast(retval); } static __host__ __device__ __forceinline__ T Lowest() { UnsignedBits retval = LOWEST_KEY; return reinterpret_cast(retval); } }; template struct FpLimits; template <> struct FpLimits { static __host__ __device__ __forceinline__ float Max() { return FLT_MAX; } static __host__ __device__ __forceinline__ float Lowest() { return FLT_MAX * float(-1); } }; template <> struct FpLimits { static __host__ __device__ __forceinline__ double Max() { return DBL_MAX; } static __host__ __device__ __forceinline__ double Lowest() { return DBL_MAX * double(-1); } }; #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA template <> struct FpLimits<__half> { static __host__ __device__ __forceinline__ __half Max() { unsigned short max_word = 0x7BFF; return reinterpret_cast<__half&>(max_word); } static __host__ __device__ __forceinline__ __half Lowest() { unsigned short lowest_word = 0xFBFF; return reinterpret_cast<__half&>(lowest_word); } }; #endif #if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !_NVHPC_CUDA && \ !defined(CUB_DISABLE_BF16_SUPPORT) template <> struct FpLimits<__nv_bfloat16> { static __host__ __device__ __forceinline__ __nv_bfloat16 Max() { unsigned short max_word = 0x7F7F; return reinterpret_cast<__nv_bfloat16&>(max_word); } static __host__ __device__ __forceinline__ __nv_bfloat16 Lowest() { unsigned short lowest_word = 0xFF7F; return 
reinterpret_cast<__nv_bfloat16&>(lowest_word); } }; #endif /** * Basic type traits (fp primitive specialization) */ template struct BaseTraits { typedef _UnsignedBits UnsignedBits; static const Category CATEGORY = FLOATING_POINT; static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); static const UnsignedBits LOWEST_KEY = UnsignedBits(-1); static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; enum { PRIMITIVE = true, NULL_TYPE = false, }; static __host__ __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) { UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; return key ^ mask; }; static __host__ __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) { UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1); return key ^ mask; }; static __host__ __device__ __forceinline__ T Max() { return FpLimits::Max(); } static __host__ __device__ __forceinline__ T Lowest() { return FpLimits::Lowest(); } }; /** * \brief Numeric type traits */ // clang-format off template struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA template <> struct NumericTraits<__half> : BaseTraits {}; #endif #if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !_NVHPC_CUDA && \ !defined(CUB_DISABLE_BF16_SUPPORT) template <> struct NumericTraits<__nv_bfloat16> : BaseTraits {}; #endif template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; // clang-format on /** * \brief Type traits */ template struct Traits : NumericTraits::type> {}; #endif // DOXYGEN_SHOULD_SKIP_THIS /** @} */ // end group UtilModule CUB_NAMESPACE_END cub-2.0.1/cub/version.cuh000066400000000000000000000060541434614775400152460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /*! \file version.cuh * \brief Compile-time macros encoding CUB release version * * is the only CUB header that is guaranteed to * change with every CUB release. * */ #pragma once /*! \def CUB_VERSION * \brief The preprocessor macro \p CUB_VERSION encodes the version * number of the CUB library. * * CUB_VERSION % 100 is the sub-minor version. * CUB_VERSION / 100 % 1000 is the minor version. * CUB_VERSION / 100000 is the major version. */ #define CUB_VERSION 200001 /*! \def CUB_MAJOR_VERSION * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the * major version number of the CUB library. */ #define CUB_MAJOR_VERSION (CUB_VERSION / 100000) /*! \def CUB_MINOR_VERSION * \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the * minor version number of the CUB library. */ #define CUB_MINOR_VERSION (CUB_VERSION / 100 % 1000) /*! \def CUB_SUBMINOR_VERSION * \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the * sub-minor version number of the CUB library. */ #define CUB_SUBMINOR_VERSION (CUB_VERSION % 100) /*! \def CUB_PATCH_NUMBER * \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the * patch number of the CUB library. */ #define CUB_PATCH_NUMBER 0 cub-2.0.1/cub/warp/000077500000000000000000000000001434614775400140245ustar00rootroot00000000000000cub-2.0.1/cub/warp/specializations/000077500000000000000000000000001434614775400172255ustar00rootroot00000000000000cub-2.0.1/cub/warp/specializations/warp_reduce_shfl.cuh000066400000000000000000000577221434614775400232570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. */ #pragma once #include "../../config.cuh" #include "../../thread/thread_operators.cuh" #include "../../util_ptx.cuh" #include "../../util_type.cuh" #include #include #include CUB_NAMESPACE_BEGIN /** * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. * * LOGICAL_WARP_THREADS must be a power-of-two */ template < typename T, ///< Data type being reduced int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpReduceShfl { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); //--------------------------------------------------------------------- // Constants and type definitions //--------------------------------------------------------------------- enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp reduction steps STEPS = Log2::VALUE, /// Number of logical warps in a PTX warp LOGICAL_WARPS = CUB_WARP_THREADS(0) / LOGICAL_WARP_THREADS, /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up SHFL_C = (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS) << 8 }; template struct IsInteger { enum { ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) }; }; /// Shared memory storage layout type typedef NullType TempStorage; //--------------------------------------------------------------------- // Thread fields //--------------------------------------------------------------------- /// Lane index in logical warp int lane_id; /// Logical warp index in 32-thread physical warp int warp_id; /// 32-thread physical warp member mask of logical warp uint32_t member_mask; //--------------------------------------------------------------------- // Construction //--------------------------------------------------------------------- /// Constructor __device__ __forceinline__ WarpReduceShfl( TempStorage &/*temp_storage*/) : lane_id(static_cast(LaneId())) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { if (!IS_ARCH_WARP) { lane_id = lane_id % LOGICAL_WARP_THREADS; } } //--------------------------------------------------------------------- // Reduction steps //--------------------------------------------------------------------- /// Reduction (specialized for summation across uint32 types) __device__ __forceinline__ unsigned int ReduceStep( unsigned int input, ///< [in] Calling thread's input item. 
cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset) ///< [in] Up-offset to pull from { unsigned int output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 r0;" " .reg .pred p;" " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" " @p add.u32 r0, r0, %4;" " mov.u32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); #else asm volatile( "{" " .reg .u32 r0;" " .reg .pred p;" " shfl.down.b32 r0|p, %1, %2, %3;" " @p add.u32 r0, r0, %4;" " mov.u32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); #endif return output; } /// Reduction (specialized for summation across fp32 types) __device__ __forceinline__ float ReduceStep( float input, ///< [in] Calling thread's input item. cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset) ///< [in] Up-offset to pull from { float output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .f32 r0;" " .reg .pred p;" " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" " @p add.f32 r0, r0, %4;" " mov.f32 %0, r0;" "}" : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); #else asm volatile( "{" " .reg .f32 r0;" " .reg .pred p;" " shfl.down.b32 r0|p, %1, %2, %3;" " @p add.f32 r0, r0, %4;" " mov.f32 %0, r0;" "}" : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); #endif return output; } /// Reduction (specialized for summation across unsigned long long types) __device__ __forceinline__ unsigned long long ReduceStep( unsigned long long input, ///< [in] Calling thread's input item. cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset) ///< [in] Up-offset to pull from { unsigned long long output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" " mov.b64 %0, {lo, hi};" " @p add.u64 %0, %0, %1;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); #else asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.down.b32 lo|p, lo, %2, %3;" " shfl.down.b32 hi|p, hi, %2, %3;" " mov.b64 %0, {lo, hi};" " @p add.u64 %0, %0, %1;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); #endif return output; } /// Reduction (specialized for summation across long long types) __device__ __forceinline__ long long ReduceStep( long long input, ///< [in] Calling thread's input item. 
cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset) ///< [in] Up-offset to pull from { long long output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" " mov.b64 %0, {lo, hi};" " @p add.s64 %0, %0, %1;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); #else asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.down.b32 lo|p, lo, %2, %3;" " shfl.down.b32 hi|p, hi, %2, %3;" " mov.b64 %0, {lo, hi};" " @p add.s64 %0, %0, %1;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); #endif return output; } /// Reduction (specialized for summation across double types) __device__ __forceinline__ double ReduceStep( double input, ///< [in] Calling thread's input item. cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset) ///< [in] Up-offset to pull from { double output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " .reg .f64 r0;" " mov.b64 %0, %1;" " mov.b64 {lo, hi}, %1;" " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" " mov.b64 r0, {lo, hi};" " @p add.f64 %0, %0, r0;" "}" : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); #else asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " .reg .f64 r0;" " mov.b64 %0, %1;" " mov.b64 {lo, hi}, %1;" " shfl.down.b32 lo|p, lo, %2, %3;" " shfl.down.b32 hi|p, hi, %2, %3;" " mov.b64 r0, {lo, hi};" " @p add.f64 %0, %0, r0;" "}" : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); #endif return output; } /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) template __device__ __forceinline__ KeyValuePair ReduceStep( KeyValuePair input, ///< [in] Calling thread's input item. SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset) ///< [in] Up-offset to pull from { KeyValuePair output; KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); output.key = input.key; output.value = ReduceStep( input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); if (input.key != other_key) output.value = input.value; return output; } /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) template __device__ __forceinline__ KeyValuePair ReduceStep( KeyValuePair input, ///< [in] Calling thread's input item. 
SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset) ///< [in] Up-offset to pull from { KeyValuePair output; output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); if (input.key > 0) output.value = input.value; return output; } /// Reduction step (generic) template __device__ __forceinline__ _T ReduceStep( _T input, ///< [in] Calling thread's input item. ReductionOp reduction_op, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset) ///< [in] Up-offset to pull from { _T output = input; _T temp = ShuffleDown(output, offset, last_lane, member_mask); // Perform reduction op if valid if (offset + lane_id <= last_lane) output = reduction_op(input, temp); return output; } /// Reduction step (specialized for small unsigned integers size 32b or less) template __device__ __forceinline__ _T ReduceStep( _T input, ///< [in] Calling thread's input item. ReductionOp reduction_op, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset, ///< [in] Up-offset to pull from Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer { return ReduceStep(input, reduction_op, last_lane, offset); } /// Reduction step (specialized for types other than small unsigned integers size 32b or less) template __device__ __forceinline__ _T ReduceStep( _T input, ///< [in] Calling thread's input item. ReductionOp reduction_op, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment int offset, ///< [in] Up-offset to pull from Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer { return ReduceStep(input, reduction_op, last_lane, offset); } //--------------------------------------------------------------------- // Templated inclusive scan iteration //--------------------------------------------------------------------- template __device__ __forceinline__ void ReduceStep( T& input, ///< [in] Calling thread's input item. ReductionOp reduction_op, ///< [in] Binary reduction operator int last_lane, ///< [in] Index of last lane in segment Int2Type /*step*/) { input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); ReduceStep(input, reduction_op, last_lane, Int2Type()); } template __device__ __forceinline__ void ReduceStep( T& /*input*/, ///< [in] Calling thread's input item. 
ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator int /*last_lane*/, ///< [in] Index of last lane in segment Int2Type /*step*/) {} //--------------------------------------------------------------------- // Reduction operations //--------------------------------------------------------------------- template __device__ __forceinline__ T ReduceImpl( Int2Type<0> /* all_lanes_valid */, T input, ///< [in] Calling thread's input int valid_items, ///< [in] Total number of valid items across the logical warp ReductionOp reduction_op) ///< [in] Binary reduction operator { int last_lane = valid_items - 1; T output = input; // Template-iterate reduction steps ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); return output; } template __device__ __forceinline__ T ReduceImpl( Int2Type<1> /* all_lanes_valid */, T input, ///< [in] Calling thread's input int /* valid_items */, ///< [in] Total number of valid items across the logical warp ReductionOp reduction_op) ///< [in] Binary reduction operator { int last_lane = LOGICAL_WARP_THREADS - 1; T output = input; // Template-iterate reduction steps ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); return output; } // Warp reduce functions are not supported by nvc++ (NVBug 3694682) #ifndef _NVHPC_CUDA template __device__ __forceinline__ typename std::enable_if< std::is_same::value || std::is_same::value, T>::type ReduceImpl(Int2Type<1> /* all_lanes_valid */, T input, int /* valid_items */, cub::Sum /* reduction_op */) { T output = input; NV_IF_TARGET(NV_PROVIDES_SM_80, (output = __reduce_add_sync(member_mask, input);), (output = ReduceImpl(Int2Type<1>{}, input, LOGICAL_WARP_THREADS, cub::Sum{});)); return output; } template __device__ __forceinline__ typename std::enable_if< std::is_same::value || std::is_same::value, T>::type ReduceImpl(Int2Type<1> /* all_lanes_valid */, T input, int /* valid_items */, cub::Min /* reduction_op */) { T output = input; NV_IF_TARGET(NV_PROVIDES_SM_80, (output = __reduce_min_sync(member_mask, input);), (output = ReduceImpl(Int2Type<1>{}, input, LOGICAL_WARP_THREADS, cub::Min{});)); return output; } template __device__ __forceinline__ typename std::enable_if< std::is_same::value || std::is_same::value, T>::type ReduceImpl(Int2Type<1> /* all_lanes_valid */, T input, int /* valid_items */, cub::Max /* reduction_op */) { T output = input; NV_IF_TARGET(NV_PROVIDES_SM_80, (output = __reduce_max_sync(member_mask, input);), (output = ReduceImpl(Int2Type<1>{}, input, LOGICAL_WARP_THREADS, cub::Max{});)); return output; } #endif // _NVHPC_CUDA /// Reduction template < bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items typename ReductionOp> __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input int valid_items, ///< [in] Total number of valid items across the logical warp ReductionOp reduction_op) ///< [in] Binary reduction operator { return ReduceImpl( Int2Type{}, input, valid_items, reduction_op); } /// Segmented reduction template < bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail typename FlagT, typename ReductionOp> __device__ __forceinline__ T SegmentedReduce( T input, ///< [in] Calling thread's input FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail ReductionOp reduction_op) ///< [in] Binary reduction operator { // Get the start flags for each thread in the warp. 
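    // (Each set bit of the ballot below marks a lane whose segment flag is set.
    //  The flags are converted to tail flags, bits below the current lane are
    //  masked away, and __clz(__brev(...)) then returns the index of the nearest
    //  tail flag at or after this lane -- i.e. the last lane of this segment --
    //  which bounds the shuffle-based reduction steps that follow.)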
int warp_flags = WARP_BALLOT(flag, member_mask); // Convert to tail-segmented if (HEAD_SEGMENTED) warp_flags >>= 1; // Mask out the bits below the current thread warp_flags &= LaneMaskGe(); // Mask of physical lanes outside the logical warp and convert to logical lanemask if (!IS_ARCH_WARP) { warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS); } // Mask in the last lane of logical warp warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1); // Find the next set flag int last_lane = __clz(__brev(warp_flags)); T output = input; // // Iterate reduction steps // #pragma unroll // for (int STEP = 0; STEP < STEPS; STEP++) // { // output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); // } // Template-iterate reduction steps ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); return output; } }; CUB_NAMESPACE_END cub-2.0.1/cub/warp/specializations/warp_reduce_smem.cuh000066400000000000000000000331431434614775400232530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. */ #pragma once #include "../../config.cuh" #include "../../thread/thread_operators.cuh" #include "../../thread/thread_load.cuh" #include "../../thread/thread_store.cuh" #include "../../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
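 *
 * \par
 * A minimal usage sketch of this internal specialization; most code is
 * expected to go through the public cub::WarpReduce interface instead, which
 * selects an appropriate specialization. The kernel name, block size, and
 * 16-thread logical warp size below are illustrative assumptions only:
 *
 * \par
 * \code
 * #include <cub/warp/specializations/warp_reduce_smem.cuh>
 *
 * __global__ void ExampleKernel(int *d_out)
 * {
 *     constexpr int warp_threads    = 16;
 *     constexpr int block_threads   = 64;
 *     constexpr int warps_per_block = block_threads / warp_threads;
 *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
 *     const int lane_id = static_cast<int>(threadIdx.x) % warp_threads;
 *
 *     // Specialize the smem-based reduction for logical warps of 16 integers
 *     using WarpReduceSmemT = cub::WarpReduceSmem<int, warp_threads>;
 *     __shared__ typename WarpReduceSmemT::TempStorage temp_storage[warps_per_block];
 *
 *     int thread_data = static_cast<int>(threadIdx.x);
 *
 *     // All 16 lanes of each logical warp contribute a valid item
 *     int aggregate = WarpReduceSmemT(temp_storage[warp_id]).Reduce<true>(
 *         thread_data, warp_threads, cub::Sum());
 *
 *     // Lane 0 of each logical warp holds that warp's aggregate
 *     if (lane_id == 0)
 *     {
 *         d_out[warp_id] = aggregate;
 *     }
 * }
 * \endcode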
*/ template < typename T, ///< Data type being reduced int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpReduceSmem { /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = PowerOfTwo::VALUE, /// The number of warp scan steps STEPS = Log2::VALUE, /// The number of threads in half a warp HALF_WARP_THREADS = 1 << (STEPS - 1), /// The number of shared memory elements per warp WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, /// FlagT status (when not using ballot) UNSET = 0x0, // Is initially unset SET = 0x1, // Is initially set SEEN = 0x2, // Has seen another head flag from a successor peer }; /// Shared memory flag type typedef unsigned char SmemFlag; /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) struct _TempStorage { T reduce[WARP_SMEM_ELEMENTS]; SmemFlag flags[WARP_SMEM_ELEMENTS]; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /****************************************************************************** * Thread fields ******************************************************************************/ _TempStorage &temp_storage; unsigned int lane_id; unsigned int member_mask; /****************************************************************************** * Construction ******************************************************************************/ /// Constructor explicit __device__ __forceinline__ WarpReduceSmem(TempStorage &temp_storage) : temp_storage(temp_storage.Alias()) , lane_id(IS_ARCH_WARP ? 
LaneId() : LaneId() % LOGICAL_WARP_THREADS) , member_mask( WarpMask(LaneId() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** * Utility methods ******************************************************************************/ //--------------------------------------------------------------------- // Regular reduction //--------------------------------------------------------------------- /** * Reduction step */ template < bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items typename ReductionOp, int STEP> __device__ __forceinline__ T ReduceStep( T input, ///< [in] Calling thread's input int valid_items, ///< [in] Total number of valid items across the logical warp ReductionOp reduction_op, ///< [in] Reduction operator Int2Type /*step*/) { const int OFFSET = 1 << STEP; // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); WARP_SYNC(member_mask); // Update input if peer_addend is in range if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) { T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); input = reduction_op(input, peer_addend); } WARP_SYNC(member_mask); return ReduceStep(input, valid_items, reduction_op, Int2Type()); } /** * Reduction step (terminate) */ template < bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items typename ReductionOp> __device__ __forceinline__ T ReduceStep( T input, ///< [in] Calling thread's input int valid_items, ///< [in] Total number of valid items across the logical warp ReductionOp /*reduction_op*/, ///< [in] Reduction operator Int2Type /*step*/) { return input; } //--------------------------------------------------------------------- // Segmented reduction //--------------------------------------------------------------------- /** * Ballot-based segmented reduce */ template < bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail typename FlagT, typename ReductionOp> __device__ __forceinline__ T SegmentedReduce( T input, ///< [in] Calling thread's input FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail ReductionOp reduction_op, ///< [in] Reduction operator Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality { // Get the start flags for each thread in the warp. int warp_flags = WARP_BALLOT(flag, member_mask); if (!HEAD_SEGMENTED) warp_flags <<= 1; // Keep bits above the current thread. 
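        // (LaneMaskGt() below leaves only the flags of lanes above the current
        //  one set, so the __brev/__clz pair that follows locates the first
        //  segment flag after this lane; reduction steps then only accumulate
        //  peers that fall before that boundary.)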
warp_flags &= LaneMaskGt(); // Accommodate packing of multiple logical warps in a single physical warp if (!IS_ARCH_WARP) { warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; } // Find next flag int next_flag = __clz(__brev(warp_flags)); // Clip the next segment at the warp boundary if necessary if (LOGICAL_WARP_THREADS != 32) next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { const int OFFSET = 1 << STEP; // Share input into buffer ThreadStore(&temp_storage.reduce[lane_id], input); WARP_SYNC(member_mask); // Update input if peer_addend is in range if (OFFSET + lane_id < next_flag) { T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); input = reduction_op(input, peer_addend); } WARP_SYNC(member_mask); } return input; } /** * Smem-based segmented reduce */ template < bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail typename FlagT, typename ReductionOp> __device__ __forceinline__ T SegmentedReduce( T input, ///< [in] Calling thread's input FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail ReductionOp reduction_op, ///< [in] Reduction operator Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality { enum { UNSET = 0x0, // Is initially unset SET = 0x1, // Is initially set SEEN = 0x2, // Has seen another head flag from a successor peer }; // Alias flags onto shared data storage volatile SmemFlag *flag_storage = temp_storage.flags; SmemFlag flag_status = (flag) ? SET : UNSET; for (int STEP = 0; STEP < STEPS; STEP++) { const int OFFSET = 1 << STEP; // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); WARP_SYNC(member_mask); // Get peer from buffer T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); WARP_SYNC(member_mask); // Share flag through buffer flag_storage[lane_id] = flag_status; // Get peer flag from buffer SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; // Update input if peer was in range if (lane_id < LOGICAL_WARP_THREADS - OFFSET) { if (HEAD_SEGMENTED) { // Head-segmented if ((flag_status & SEEN) == 0) { // Has not seen a more distant head flag if (peer_flag_status & SET) { // Has now seen a head flag flag_status |= SEEN; } else { // Peer is not a head flag: grab its count input = reduction_op(input, peer_addend); } // Update seen status to include that of peer flag_status |= (peer_flag_status & SEEN); } } else { // Tail-segmented. 
Simply propagate flag status if (!flag_status) { input = reduction_op(input, peer_addend); flag_status |= peer_flag_status; } } } } return input; } /****************************************************************************** * Interface ******************************************************************************/ /** * Reduction */ template < bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items typename ReductionOp> __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input int valid_items, ///< [in] Total number of valid items across the logical warp ReductionOp reduction_op) ///< [in] Reduction operator { return ReduceStep(input, valid_items, reduction_op, Int2Type<0>()); } /** * Segmented reduction */ template < bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail typename FlagT, typename ReductionOp> __device__ __forceinline__ T SegmentedReduce( T input, ///< [in] Calling thread's input FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail ReductionOp reduction_op) ///< [in] Reduction operator { return SegmentedReduce(input, flag, reduction_op, Int2Type()); } }; CUB_NAMESPACE_END cub-2.0.1/cub/warp/specializations/warp_scan_shfl.cuh000066400000000000000000000607151434614775400227300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. */ #pragma once #include "../../config.cuh" #include "../../thread/thread_operators.cuh" #include "../../util_type.cuh" #include "../../util_ptx.cuh" CUB_NAMESPACE_BEGIN /** * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
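 *
 * \par
 * A minimal usage sketch of this internal specialization; the public
 * cub::WarpScan interface is the usual entry point. The kernel name and the
 * single-warp launch below are illustrative assumptions only:
 *
 * \par
 * \code
 * #include <cub/warp/specializations/warp_scan_shfl.cuh>
 *
 * __global__ void ExampleKernel(int *d_out)
 * {
 *     // One full 32-thread warp scanning one int per thread
 *     using WarpScanShflT = cub::WarpScanShfl<int, 32>;
 *     typename WarpScanShflT::TempStorage temp_storage;  // empty for the shfl variant
 *
 *     int thread_data = static_cast<int>(threadIdx.x);
 *     int inclusive;
 *     WarpScanShflT(temp_storage).InclusiveScan(thread_data, inclusive, cub::Sum());
 *
 *     d_out[threadIdx.x] = inclusive;  // in the first warp, lane i receives 0 + 1 + ... + i
 * }
 * \endcode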
* * LOGICAL_WARP_THREADS must be a power-of-two */ template < typename T, ///< Data type being scanned int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpScanShfl { //--------------------------------------------------------------------- // Constants and type definitions //--------------------------------------------------------------------- enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp scan steps STEPS = Log2::VALUE, /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up SHFL_C = (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS) << 8 }; template struct IntegerTraits { enum { ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) }; }; /// Shared memory storage layout type struct TempStorage {}; //--------------------------------------------------------------------- // Thread fields //--------------------------------------------------------------------- /// Lane index in logical warp unsigned int lane_id; /// Logical warp index in 32-thread physical warp unsigned int warp_id; /// 32-thread physical warp member mask of logical warp unsigned int member_mask; //--------------------------------------------------------------------- // Construction //--------------------------------------------------------------------- /// Constructor explicit __device__ __forceinline__ WarpScanShfl(TempStorage & /*temp_storage*/) : lane_id(LaneId()) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { if (!IS_ARCH_WARP) { lane_id = lane_id % LOGICAL_WARP_THREADS; } } //--------------------------------------------------------------------- // Inclusive scan steps //--------------------------------------------------------------------- /// Inclusive prefix scan step (specialized for summation across int32 types) __device__ __forceinline__ int InclusiveScanStep( int input, ///< [in] Calling thread's input item. cub::Sum /*scan_op*/, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset) ///< [in] Up-offset to pull from { int output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .s32 r0;" " .reg .pred p;" " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" " @p add.s32 r0, r0, %4;" " mov.s32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); #else asm volatile( "{" " .reg .s32 r0;" " .reg .pred p;" " shfl.up.b32 r0|p, %1, %2, %3;" " @p add.s32 r0, r0, %4;" " mov.s32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); #endif return output; } /// Inclusive prefix scan step (specialized for summation across uint32 types) __device__ __forceinline__ unsigned int InclusiveScanStep( unsigned int input, ///< [in] Calling thread's input item. 
cub::Sum /*scan_op*/, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset) ///< [in] Up-offset to pull from { unsigned int output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 r0;" " .reg .pred p;" " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" " @p add.u32 r0, r0, %4;" " mov.u32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); #else asm volatile( "{" " .reg .u32 r0;" " .reg .pred p;" " shfl.up.b32 r0|p, %1, %2, %3;" " @p add.u32 r0, r0, %4;" " mov.u32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); #endif return output; } /// Inclusive prefix scan step (specialized for summation across fp32 types) __device__ __forceinline__ float InclusiveScanStep( float input, ///< [in] Calling thread's input item. cub::Sum /*scan_op*/, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset) ///< [in] Up-offset to pull from { float output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .f32 r0;" " .reg .pred p;" " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" " @p add.f32 r0, r0, %4;" " mov.f32 %0, r0;" "}" : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); #else asm volatile( "{" " .reg .f32 r0;" " .reg .pred p;" " shfl.up.b32 r0|p, %1, %2, %3;" " @p add.f32 r0, r0, %4;" " mov.f32 %0, r0;" "}" : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); #endif return output; } /// Inclusive prefix scan step (specialized for summation across unsigned long long types) __device__ __forceinline__ unsigned long long InclusiveScanStep( unsigned long long input, ///< [in] Calling thread's input item. cub::Sum /*scan_op*/, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset) ///< [in] Up-offset to pull from { unsigned long long output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u64 r0;" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" " mov.b64 r0, {lo, hi};" " @p add.u64 r0, r0, %4;" " mov.u64 %0, r0;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); #else asm volatile( "{" " .reg .u64 r0;" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.up.b32 lo|p, lo, %2, %3;" " shfl.up.b32 hi|p, hi, %2, %3;" " mov.b64 r0, {lo, hi};" " @p add.u64 r0, r0, %4;" " mov.u64 %0, r0;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); #endif return output; } /// Inclusive prefix scan step (specialized for summation across long long types) __device__ __forceinline__ long long InclusiveScanStep( long long input, ///< [in] Calling thread's input item. 
cub::Sum /*scan_op*/, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset) ///< [in] Up-offset to pull from { long long output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .s64 r0;" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" " mov.b64 r0, {lo, hi};" " @p add.s64 r0, r0, %4;" " mov.s64 %0, r0;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); #else asm volatile( "{" " .reg .s64 r0;" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.up.b32 lo|p, lo, %2, %3;" " shfl.up.b32 hi|p, hi, %2, %3;" " mov.b64 r0, {lo, hi};" " @p add.s64 r0, r0, %4;" " mov.s64 %0, r0;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); #endif return output; } /// Inclusive prefix scan step (specialized for summation across fp64 types) __device__ __forceinline__ double InclusiveScanStep( double input, ///< [in] Calling thread's input item. cub::Sum /*scan_op*/, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset) ///< [in] Up-offset to pull from { double output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers #ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " .reg .f64 r0;" " mov.b64 %0, %1;" " mov.b64 {lo, hi}, %1;" " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" " mov.b64 r0, {lo, hi};" " @p add.f64 %0, %0, r0;" "}" : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); #else asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " .reg .f64 r0;" " mov.b64 %0, %1;" " mov.b64 {lo, hi}, %1;" " shfl.up.b32 lo|p, lo, %2, %3;" " shfl.up.b32 hi|p, hi, %2, %3;" " mov.b64 r0, {lo, hi};" " @p add.f64 %0, %0, r0;" "}" : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); #endif return output; } /* /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) template __device__ __forceinline__ KeyValuePairInclusiveScanStep( KeyValuePair input, ///< [in] Calling thread's input item. ReduceBySegmentOp scan_op, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset) ///< [in] Up-offset to pull from { KeyValuePair output; output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); if (input.key > 0) output.value = input.value; return output; } */ /// Inclusive prefix scan step (generic) template __device__ __forceinline__ _T InclusiveScanStep( _T input, ///< [in] Calling thread's input item. 
ScanOpT scan_op, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset) ///< [in] Up-offset to pull from { _T temp = ShuffleUp(input, offset, first_lane, member_mask); // Perform scan op if from a valid peer _T output = scan_op(temp, input); if (static_cast(lane_id) < first_lane + offset) output = input; return output; } /// Inclusive prefix scan step (specialized for small integers size 32b or less) template __device__ __forceinline__ _T InclusiveScanStep( _T input, ///< [in] Calling thread's input item. ScanOpT scan_op, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset, ///< [in] Up-offset to pull from Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer { return InclusiveScanStep(input, scan_op, first_lane, offset); } /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) template __device__ __forceinline__ _T InclusiveScanStep( _T input, ///< [in] Calling thread's input item. ScanOpT scan_op, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset, ///< [in] Up-offset to pull from Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer { return InclusiveScanStep(input, scan_op, first_lane, offset); } /****************************************************************************** * Interface ******************************************************************************/ //--------------------------------------------------------------------- // Broadcast //--------------------------------------------------------------------- /// Broadcast __device__ __forceinline__ T Broadcast( T input, ///< [in] The value to broadcast int src_lane) ///< [in] Which warp lane is to do the broadcasting { return ShuffleIndex(input, src_lane, member_mask); } //--------------------------------------------------------------------- // Inclusive operations //--------------------------------------------------------------------- /// Inclusive scan template __device__ __forceinline__ void InclusiveScan( _T input, ///< [in] Calling thread's input item. _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. ScanOpT scan_op) ///< [in] Binary scan operator { inclusive_output = input; // Iterate scan steps int segment_first_lane = 0; // Iterate scan steps #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { inclusive_output = InclusiveScanStep( inclusive_output, scan_op, segment_first_lane, (1 << STEP), Int2Type::IS_SMALL_UNSIGNED>()); } } /// Inclusive scan, specialized for reduce-value-by-key template __device__ __forceinline__ void InclusiveScan( KeyValuePair input, ///< [in] Calling thread's input item. KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
ReduceByKeyOp scan_op) ///< [in] Binary scan operator { inclusive_output = input; KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); // Mask away all lanes greater than ours ballot = ballot & LaneMaskLe(); // Find index of first set bit int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); // Iterate scan steps #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { inclusive_output.value = InclusiveScanStep( inclusive_output.value, scan_op.op, segment_first_lane, (1 << STEP), Int2Type::IS_SMALL_UNSIGNED>()); } } /// Inclusive scan with aggregate template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item. T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. ScanOpT scan_op, ///< [in] Binary scan operator T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. { InclusiveScan(input, inclusive_output, scan_op); // Grab aggregate from last warp lane warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask); } //--------------------------------------------------------------------- // Get exclusive from inclusive //--------------------------------------------------------------------- /// Update inclusive and exclusive using input and inclusive template __device__ __forceinline__ void Update( T /*input*/, ///< [in] T &inclusive, ///< [in, out] T &exclusive, ///< [out] ScanOpT /*scan_op*/, ///< [in] IsIntegerT /*is_integer*/) ///< [in] { // initial value unknown exclusive = ShuffleUp(inclusive, 1, 0, member_mask); } /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) __device__ __forceinline__ void Update( T input, T &inclusive, T &exclusive, cub::Sum /*scan_op*/, Int2Type /*is_integer*/) { // initial value presumed 0 exclusive = inclusive - input; } /// Update inclusive and exclusive using initial value using input, inclusive, and initial value template __device__ __forceinline__ void Update ( T /*input*/, T &inclusive, T &exclusive, ScanOpT scan_op, T initial_value, IsIntegerT /*is_integer*/) { inclusive = scan_op(initial_value, inclusive); exclusive = ShuffleUp(inclusive, 1, 0, member_mask); if (lane_id == 0) exclusive = initial_value; } /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) __device__ __forceinline__ void Update ( T input, T &inclusive, T &exclusive, cub::Sum scan_op, T initial_value, Int2Type /*is_integer*/) { inclusive = scan_op(initial_value, inclusive); exclusive = inclusive - input; } /// Update inclusive, exclusive, and warp aggregate using input and inclusive template __device__ __forceinline__ void Update ( T input, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT scan_op, IsIntegerT is_integer) { warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); Update(input, inclusive, exclusive, scan_op, is_integer); } /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value template __device__ __forceinline__ void Update ( T input, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT scan_op, T initial_value, IsIntegerT is_integer) { warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); } }; CUB_NAMESPACE_END 
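// An illustrative note on the Update() overloads above: for integer summation
// the exclusive prefix is recovered arithmetically as exclusive = inclusive - input,
// whereas general scan operators shuffle the inclusive result up from the
// predecessor lane instead. For example, with cub::Sum over one warp:
//
//   lane:       0   1   2   3  ...
//   input:      1   2   3   4  ...
//   inclusive:  1   3   6  10  ...   (inclusive prefix sums)
//   exclusive:  0   1   3   6  ...   (= inclusive - input; lane 0 presumes 0)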
cub-2.0.1/cub/warp/specializations/warp_scan_smem.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
 */

#pragma once

#include "../../config.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../util_type.cuh"

CUB_NAMESPACE_BEGIN

/**
 * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
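 *
 * \par
 * A minimal usage sketch of this internal specialization; the public
 * cub::WarpScan interface is the usual entry point. The kernel name, block
 * size, and 16-thread logical warp size below are illustrative assumptions
 * only:
 *
 * \par
 * \code
 * #include <cub/warp/specializations/warp_scan_smem.cuh>
 *
 * __global__ void ExampleKernel(int *d_out)
 * {
 *     constexpr int warp_threads    = 16;
 *     constexpr int block_threads   = 64;
 *     constexpr int warps_per_block = block_threads / warp_threads;
 *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
 *
 *     // Specialize the smem-based scan for logical warps of 16 integers
 *     using WarpScanSmemT = cub::WarpScanSmem<int, warp_threads>;
 *     __shared__ typename WarpScanSmemT::TempStorage temp_storage[warps_per_block];
 *
 *     int thread_data = static_cast<int>(threadIdx.x) % warp_threads;
 *
 *     int inclusive;
 *     WarpScanSmemT(temp_storage[warp_id]).InclusiveScan(thread_data, inclusive, cub::Sum());
 *
 *     d_out[threadIdx.x] = inclusive;  // lane i of each logical warp gets 0 + 1 + ... + i
 * }
 * \endcode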
*/ template < typename T, ///< Data type being scanned int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpScanSmem { /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp scan steps STEPS = Log2::VALUE, /// The number of threads in half a warp HALF_WARP_THREADS = 1 << (STEPS - 1), /// The number of shared memory elements per warp WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, }; /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars) using CellT = T; /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) typedef CellT _TempStorage[WARP_SMEM_ELEMENTS]; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /****************************************************************************** * Thread fields ******************************************************************************/ _TempStorage &temp_storage; unsigned int lane_id; unsigned int member_mask; /****************************************************************************** * Construction ******************************************************************************/ /// Constructor explicit __device__ __forceinline__ WarpScanSmem( TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS), member_mask( WarpMask(LaneId() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** * Utility methods ******************************************************************************/ /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) template < bool HAS_IDENTITY, int STEP, typename ScanOp> __device__ __forceinline__ void ScanStep( T &partial, ScanOp scan_op, Int2Type /*step*/) { const int OFFSET = 1 << STEP; // Share partial into buffer ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); WARP_SYNC(member_mask); // Update partial if addend is in range if (HAS_IDENTITY || (lane_id >= OFFSET)) { T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); partial = scan_op(addend, partial); } WARP_SYNC(member_mask); ScanStep(partial, scan_op, Int2Type()); } /// Basic inclusive scan iteration(template unrolled, base-case specialization) template < bool HAS_IDENTITY, typename ScanOp> __device__ __forceinline__ void ScanStep( T &/*partial*/, ScanOp /*scan_op*/, Int2Type /*step*/) {} /// Inclusive prefix scan (specialized for summation across primitive types) __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item. T &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
Sum scan_op, ///< [in] Binary scan operator Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type { T identity = 0; ThreadStore(&temp_storage[lane_id], (CellT) identity); WARP_SYNC(member_mask); // Iterate scan steps output = input; ScanStep(output, scan_op, Int2Type<0>()); } /// Inclusive prefix scan template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item. T &output, ///< [out] Calling thread's output item. May be aliased with \p input. ScanOp scan_op, ///< [in] Binary scan operator Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type { // Iterate scan steps output = input; ScanStep(output, scan_op, Int2Type<0>()); } /****************************************************************************** * Interface ******************************************************************************/ //--------------------------------------------------------------------- // Broadcast //--------------------------------------------------------------------- /// Broadcast __device__ __forceinline__ T Broadcast( T input, ///< [in] The value to broadcast unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting { if (lane_id == src_lane) { ThreadStore(temp_storage, (CellT) input); } WARP_SYNC(member_mask); return (T)ThreadLoad(temp_storage); } //--------------------------------------------------------------------- // Inclusive operations //--------------------------------------------------------------------- /// Inclusive scan template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item. T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. ScanOp scan_op) ///< [in] Binary scan operator { InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); } /// Inclusive scan with aggregate template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item. T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. ScanOp scan_op, ///< [in] Binary scan operator T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
{ InclusiveScan(input, inclusive_output, scan_op); // Retrieve aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); WARP_SYNC(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); WARP_SYNC(member_mask); } //--------------------------------------------------------------------- // Get exclusive from inclusive //--------------------------------------------------------------------- /// Update inclusive and exclusive using input and inclusive template __device__ __forceinline__ void Update( T /*input*/, ///< [in] T &inclusive, ///< [in, out] T &exclusive, ///< [out] ScanOpT /*scan_op*/, ///< [in] IsIntegerT /*is_integer*/) ///< [in] { // initial value unknown ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); WARP_SYNC(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); } /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) __device__ __forceinline__ void Update( T input, T &inclusive, T &exclusive, cub::Sum /*scan_op*/, Int2Type /*is_integer*/) { // initial value presumed 0 exclusive = inclusive - input; } /// Update inclusive and exclusive using initial value using input, inclusive, and initial value template __device__ __forceinline__ void Update ( T /*input*/, T &inclusive, T &exclusive, ScanOpT scan_op, T initial_value, IsIntegerT /*is_integer*/) { inclusive = scan_op(initial_value, inclusive); ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); WARP_SYNC(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); if (lane_id == 0) exclusive = initial_value; } /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) __device__ __forceinline__ void Update ( T input, T &inclusive, T &exclusive, cub::Sum scan_op, T initial_value, Int2Type /*is_integer*/) { inclusive = scan_op(initial_value, inclusive); exclusive = inclusive - input; } /// Update inclusive, exclusive, and warp aggregate using input and inclusive template __device__ __forceinline__ void Update ( T /*input*/, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT /*scan_op*/, IsIntegerT /*is_integer*/) { // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); WARP_SYNC(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); } /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) __device__ __forceinline__ void Update ( T input, T &inclusive, T &exclusive, T &warp_aggregate, cub::Sum /*scan_o*/, Int2Type /*is_integer*/) { // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); WARP_SYNC(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); exclusive = inclusive - input; } /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value template __device__ __forceinline__ void Update ( T /*input*/, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT scan_op, T initial_value, IsIntegerT /*is_integer*/) { // Broadcast warp aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + 
lane_id], (CellT) inclusive); WARP_SYNC(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); WARP_SYNC(member_mask); // Update inclusive with initial value inclusive = scan_op(initial_value, inclusive); // Get exclusive from exclusive ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); WARP_SYNC(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); if (lane_id == 0) exclusive = initial_value; } }; CUB_NAMESPACE_END cub-2.0.1/cub/warp/warp_exchange.cuh000066400000000000000000000430561434614775400173500ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::WarpExchange class provides [collective](index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. */ #pragma once #include #include #include CUB_NAMESPACE_BEGIN /** * @brief The WarpExchange class provides [collective](index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. * @ingroup WarpModule * * @tparam T * The data type to be exchanged. * * @tparam ITEMS_PER_THREAD * The number of items partitioned onto each thread. * * @tparam LOGICAL_WARP_THREADS * [optional] The number of threads per "logical" warp (may be less * than the number of hardware warp threads). Default is the warp size of the * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * * @tparam LEGACY_PTX_ARCH * Unused. * * @par Overview * - It is commonplace for a warp of threads to rearrange data items between * threads. For example, the global memory accesses prefer patterns where * data items are "striped" across threads (where consecutive threads access * consecutive items), yet most warp-wide operations prefer a "blocked" * partitioning of items across threads (where consecutive items belong to a * single thread). 
* - WarpExchange supports the following types of data exchanges: * - Transposing between [blocked](index.html#sec5sec3) and * [striped](index.html#sec5sec3) arrangements * - Scattering ranked items to a * [striped arrangement](index.html#sec5sec3) * * @par A Simple Example * @par * The code snippet below illustrates the conversion from a "blocked" to a * "striped" arrangement of 64 integer items partitioned across 16 threads where * each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = * cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Load a tile of data striped across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a blocked arrangement across threads * WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. */ template class WarpExchange { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); constexpr static int ITEMS_PER_TILE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS + 1; constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); constexpr static int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0); // Insert padding if the number of items per thread is a power of two // and > 4 (otherwise we can typically use 128b loads) constexpr static bool INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE); constexpr static int PADDING_ITEMS = INSERT_PADDING ? (ITEMS_PER_TILE >> LOG_SMEM_BANKS) : 0; union _TempStorage { InputT items_shared[ITEMS_PER_TILE + PADDING_ITEMS]; }; // union TempStorage /// Shared storage reference _TempStorage &temp_storage; const unsigned int lane_id; const unsigned int warp_id; const unsigned int member_mask; public: /// \smemstorage{WarpExchange} struct TempStorage : Uninitialized<_TempStorage> {}; /*************************************************************************//** * @name Collective constructors ****************************************************************************/ //@{ WarpExchange() = delete; /** * @brief Collective constructor using the specified memory allocation as * temporary storage. */ explicit __device__ __forceinline__ WarpExchange(TempStorage &temp_storage) : temp_storage(temp_storage.Alias()) , lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { } //@} end member group /*************************************************************************//** * @name Data movement ****************************************************************************/ //@{ /** * @brief Transposes data items from blocked arrangement to * striped arrangement. 
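   *
   * @par
   * (Implementation note: the exchange goes through this class's shared-memory
   * buffer. Thread t first writes its i-th item to slot
   * ITEMS_PER_THREAD * t + i, the warp synchronizes, and each thread then
   * reads slot LOGICAL_WARP_THREADS * i + t back, which yields the striped
   * arrangement.)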
* * @par * \smemreuse * * @par Snippet * The code snippet below illustrates the conversion from a "blocked" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * WarpExchangeT(temp_storage[warp_id]).BlockedToStriped(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. * The corresponding output @p thread_data in those threads will be * { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * * @param[in] input_items * Items to exchange, converting between blocked and * striped arrangements. * * @param[out] output_items * Items from exchange, converting between striped and * blocked arrangements. May be aliased to @p input_items. */ template __device__ __forceinline__ void BlockedToStriped(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = ITEMS_PER_THREAD * lane_id + item; temp_storage.items_shared[idx] = input_items[item]; } WARP_SYNC(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = LOGICAL_WARP_THREADS * item + lane_id; output_items[item] = temp_storage.items_shared[idx]; } } /** * @brief Transposes data items from striped arrangement to * blocked arrangement. * * @par * \smemreuse * * @par Snippet * The code snippet below illustrates the conversion from a "striped" to a * "blocked" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Load a tile of data striped across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a blocked arrangement across threads * WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. 
* The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. * * @param[in] input_items * Items to exchange * * @param[out] output_items * Items from exchange. May be aliased to @p input_items. */ template __device__ __forceinline__ void StripedToBlocked(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = LOGICAL_WARP_THREADS * item + lane_id; temp_storage.items_shared[idx] = input_items[item]; } WARP_SYNC(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = ITEMS_PER_THREAD * lane_id + item; output_items[item] = temp_storage.items_shared[idx]; } } /** * @brief Exchanges valid data items annotated by rank * into striped arrangement. * * @par * \smemreuse * * @par Snippet * The code snippet below illustrates the conversion from a "scatter" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * int thread_ranks[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * WarpExchangeT(temp_storage[warp_id]).ScatterToStriped( * thread_data, thread_ranks); * @endcode * @par * Suppose the set of input @p thread_data across the block of threads * is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of * @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The * corresponding output @p thread_data in those threads will be * `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`. * * @tparam OffsetT [inferred] Signed integer type for local offsets * * @param[in,out] items Items to exchange * @param[in] ranks Corresponding scatter ranks */ template __device__ __forceinline__ void ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } /** * @brief Exchanges valid data items annotated by rank * into striped arrangement. * * @par * \smemreuse * * @par Snippet * The code snippet below illustrates the conversion from a "scatter" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) 
* { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_input[items_per_thread]; * int thread_ranks[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * int thread_output[items_per_thread]; * WarpExchangeT(temp_storage[warp_id]).ScatterToStriped( * thread_input, thread_output, thread_ranks); * @endcode * @par * Suppose the set of input @p thread_input across the block of threads * is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of * @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The * corresponding @p thread_output in those threads will be * `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`. * * @tparam OffsetT [inferred] Signed integer type for local offsets * * @param[in] input_items * Items to exchange * * @param[out] output_items * Items from exchange. May be aliased to @p input_items. * * @param[in] ranks * Corresponding scatter ranks */ template __device__ __forceinline__ void ScatterToStriped(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (INSERT_PADDING) { ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); } temp_storage.items_shared[ranks[ITEM]] = input_items[ITEM]; } WARP_SYNC(member_mask); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } output_items[ITEM] = temp_storage.items_shared[item_offset]; } } //@} end member group }; CUB_NAMESPACE_END cub-2.0.1/cub/warp/warp_load.cuh000066400000000000000000000615601434614775400165050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Operations for reading linear tiles of data into the CUDA warp. */ #pragma once #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief cub::WarpLoadAlgorithm enumerates alternative algorithms for * cub::WarpLoad to read a linear segment of data from memory into a * a CUDA warp. */ enum WarpLoadAlgorithm { /** * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is read * directly from memory. * * @par Performance Considerations * The utilization of memory transactions (coalescing) decreases as the * access stride between threads increases (i.e., the number items per thread). */ WARP_LOAD_DIRECT, /** * @par Overview * * A [striped arrangement](index.html#sec5sec3) of data is read * directly from memory. * * @par Performance Considerations * The utilization of memory transactions (coalescing) doesn't depend on * the number of items per thread. */ WARP_LOAD_STRIPED, /** * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is read * from memory using CUDA's built-in vectorized loads as a coalescing optimization. * For example, ld.global.v4.s32 instructions will be generated * when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0. * * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high until the the * access stride between threads (i.e., the number items per thread) exceeds the * maximum vector load width (typically 4 items or 64B, whichever is lower). * - The following conditions will prevent vectorization and loading will fall * back to cub::WARP_LOAD_DIRECT: * - @p ITEMS_PER_THREAD is odd * - The @p InputIteratorT is not a simple pointer type * - The block input offset is not quadword-aligned * - The data type @p T is not a built-in primitive or CUDA vector type * (e.g., @p short, @p int2, @p double, @p float2, etc.) */ WARP_LOAD_VECTORIZE, /** * @par Overview * * A [striped arrangement](index.html#sec5sec3) of data is read * efficiently from memory and then locally transposed into a * [blocked arrangement](index.html#sec5sec3). * * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high * regardless of items loaded per thread. * - The local reordering incurs slightly longer latencies and throughput than * the direct cub::WARP_LOAD_DIRECT and cub::WARP_LOAD_VECTORIZE * alternatives. */ WARP_LOAD_TRANSPOSE }; /** * @brief The WarpLoad class provides [collective](index.html#sec0) * data movement methods for loading a linear segment of items from * memory into a [blocked arrangement](index.html#sec5sec3) * across a CUDA thread block. * @ingroup WarpModule * @ingroup UtilIo * * @tparam InputT * The data type to read into (which must be convertible from the input * iterator's value type). * * @tparam ITEMS_PER_THREAD * The number of consecutive items partitioned onto each thread. 
* * @tparam ALGORITHM * [optional] cub::WarpLoadAlgorithm tuning policy. * default: cub::WARP_LOAD_DIRECT. * * @tparam LOGICAL_WARP_THREADS * [optional] The number of threads per "logical" warp (may be less * than the number of hardware warp threads). Default is the warp size of the * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * * @tparam LEGACY_PTX_ARCH * Unused. * * @par Overview * - The WarpLoad class provides a single data movement abstraction that can be * specialized to implement different cub::WarpLoadAlgorithm strategies. This * facilitates different performance policies for different architectures, data * types, granularity sizes, etc. * - WarpLoad can be optionally specialized by different data movement strategies: * -# cub::WARP_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](@ref cub::WarpLoadAlgorithm) * -# cub::WARP_LOAD_STRIPED,. A [striped arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](@ref cub::WarpLoadAlgorithm) * -# cub::WARP_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory using CUDA's built-in vectorized * loads as a coalescing optimization. [More...](@ref cub::WarpLoadAlgorithm) * -# cub::WARP_LOAD_TRANSPOSE. A [striped arrangement](index.html#sec5sec3) * of data is read directly from memory and is then locally transposed into a * [blocked arrangement](index.html#sec5sec3). [More...](@ref cub::WarpLoadAlgorithm) * * @par A Simple Example * @par * The code snippet below illustrates the loading of a linear segment of 64 * integers into a "blocked" arrangement across 16 threads where each thread * owns 4 consecutive items. The load is specialized for @p WARP_LOAD_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * * // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each * using WarpLoadT = WarpLoad; * * constexpr int warps_in_block = block_threads / warp_threads; * constexpr int tile_size = items_per_thread * warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Allocate shared memory for WarpLoad * __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; * * // Load a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, * thread_data); * @endcode * @par * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, .... * The set of @p thread_data across the first logical warp of threads in those * threads will be: * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }. 
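 * @par
 * For tiles that are not full, the guarded overloads of @p Load documented
 * below additionally accept a @p valid_items count and, optionally, an
 * out-of-bounds default value. A minimal sketch reusing the names from the
 * example above (@p valid_items is assumed to be supplied by the caller):
 * @par
 * @code
 * // Guarded load: items past `valid_items` are assigned the default -1
 * WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
 *                                       thread_data,
 *                                       valid_items,
 *                                       -1);
 * @endcode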
*/ template class WarpLoad { constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); private: /***************************************************************************** * Algorithmic variants ****************************************************************************/ /// Load helper template struct LoadInternal; template struct LoadInternal { using TempStorage = NullType; int linear_tid; __device__ __forceinline__ LoadInternal(TempStorage & /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; template struct LoadInternal { using TempStorage = NullType; int linear_tid; __device__ __forceinline__ LoadInternal(TempStorage & /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); } template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); } template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); } }; template struct LoadInternal { using TempStorage = NullType; int linear_tid; __device__ __forceinline__ LoadInternal(TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template __device__ __forceinline__ void Load( InputT *block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } template __device__ __forceinline__ void Load( const InputT *block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } template < CacheLoadModifier MODIFIER, typename ValueType, typename OffsetT> __device__ __forceinline__ void Load( CacheModifiedInputIterator block_itr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); } template __device__ __forceinline__ void Load( _InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; template struct LoadInternal { using WarpExchangeT = WarpExchange; struct _TempStorage : WarpExchangeT::TempStorage {}; struct TempStorage : Uninitialized<_TempStorage> {}; _TempStorage 
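    // Reference to the WarpExchange staging storage used to transpose the
    // striped load into a blocked arrangement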
&temp_storage; int linear_tid; __device__ __forceinline__ LoadInternal( TempStorage &temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()), linear_tid(linear_tid) {} template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); WarpExchangeT(temp_storage).StripedToBlocked(items, items); } template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); WarpExchangeT(temp_storage).StripedToBlocked(items, items); } template __device__ __forceinline__ void Load( InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); WarpExchangeT(temp_storage).StripedToBlocked(items, items); } }; /***************************************************************************** * Type definitions ****************************************************************************/ /// Internal load implementation to use using InternalLoad = LoadInternal; /// Shared memory storage layout type using _TempStorage = typename InternalLoad::TempStorage; /***************************************************************************** * Utility methods ****************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /***************************************************************************** * Thread fields ****************************************************************************/ /// Thread reference to shared storage _TempStorage &temp_storage; /// Linear thread-id int linear_tid; public: /// @smemstorage{WarpLoad} struct TempStorage : Uninitialized<_TempStorage> {}; /*************************************************************************//** * @name Collective constructors ****************************************************************************/ //@{ /** * @brief Collective constructor using a private static allocation of * shared memory as temporary storage. */ __device__ __forceinline__ WarpLoad() : temp_storage(PrivateStorage()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} /** * @brief Collective constructor using the specified memory allocation as * temporary storage. */ __device__ __forceinline__ WarpLoad(TempStorage &temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} //@} end member group /*************************************************************************//** * @name Data movement ****************************************************************************/ //@{ /** * @brief Load a linear segment of items from memory. * * @par * \smemreuse * * @par Snippet * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) 
* { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * * // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each * using WarpLoadT = WarpLoad; * * constexpr int warps_in_block = block_threads / warp_threads; * constexpr int tile_size = items_per_thread * warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Allocate shared memory for WarpLoad * __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; * * // Load a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, * thread_data); * @endcode * @par * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, .... * The set of @p thread_data across the first logical warp of threads in those * threads will be: * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }. * * @param[in] block_itr The thread block's base input iterator for loading from * @param[out] items Data to load */ template __device__ __forceinline__ void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items); } /** * @brief Load a linear segment of items from memory, guarded by range. * * @par * \smemreuse * * @par Snippet * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * * // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each * using WarpLoadT = WarpLoad; * * constexpr int warps_in_block = block_threads / warp_threads; * constexpr int tile_size = items_per_thread * warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Allocate shared memory for WarpLoad * __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; * * // Load a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, * thread_data, * valid_items); * @endcod * @par * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, ... and @p valid_items * is @p 5. * The set of @p thread_data across the first logical warp of threads in those * threads will be: * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] } with only the first * two threads being unmasked to load portions of valid data (and other items * remaining unassigned). * * @param[in] block_itr The thread block's base input iterator for loading from * @param[out] items Data to load * @param[in] valid_items Number of valid items to load */ template __device__ __forceinline__ void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); } /** * @brief Load a linear segment of items from memory, guarded by range. * * @par * \smemreuse * * @par Snippet * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) 
* { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * * // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each * using WarpLoadT = WarpLoad; * * constexpr int warps_in_block = block_threads / warp_threads; * constexpr int tile_size = items_per_thread * warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Allocate shared memory for WarpLoad * __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; * * // Load a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, * thread_data, * valid_items, * -1); * @endcode * @par * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, ..., @p valid_items * is @p 5, and the out-of-bounds default is @p -1. * The set of @p thread_data across the first logical warp of threads in those * threads will be: * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] } with only the first * two threads being unmasked to load portions of valid data (and other items * are assigned @p -1). * * @param[in] block_itr The thread block's base input iterator for loading from * @param[out] items Data to load * @param[in] valid_items Number of valid items to load * @param[in] oob_default Default value to assign out-of-bound items */ template __device__ __forceinline__ void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { InternalLoad(temp_storage, linear_tid) .Load(block_itr, items, valid_items, oob_default); } //@} end member group }; CUB_NAMESPACE_END cub-2.0.1/cub/warp/warp_merge_sort.cuh000066400000000000000000000143051434614775400177270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief The WarpMergeSort class provides methods for sorting items partitioned * across a CUDA warp using a merge sorting method. * @ingroup WarpModule * * @tparam KeyT * Key type * * @tparam ITEMS_PER_THREAD * The number of items per thread * * @tparam LOGICAL_WARP_THREADS * [optional] The number of threads per "logical" warp (may be less * than the number of hardware warp threads). Default is the warp size of the * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * * @tparam ValueT * [optional] Value type (default: cub::NullType, which indicates a * keys-only sort) * * @tparam LEGACY_PTX_ARCH * Unused. * * @par Overview * WarpMergeSort arranges items into ascending order using a comparison * functor with less-than semantics. Merge sort can handle arbitrary types * and comparison functors. * * @par A Simple Example * @par * The code snippet below illustrates a sort of 64 integer keys that are * partitioned across 16 threads where each thread owns 4 consecutive items. * @par * @code * #include // or equivalently * * struct CustomLess * { * template * __device__ bool operator()(const DataType &lhs, const DataType &rhs) * { * return lhs < rhs; * } * }; * * __global__ void ExampleKernel(...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpMergeSort for a virtual warp of 16 threads * // owning 4 integer items each * using WarpMergeSortT = * cub::WarpMergeSort; * * // Allocate shared memory for WarpMergeSort * __shared__ typename WarpMergeSortT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[items_per_thread]; * // ... * * WarpMergeSortT(temp_storage[warp_id]).Sort(thread_keys, CustomLess()); * // ... * } * @endcode * @par * Suppose the set of input @p thread_keys across a warp of threads is * `{ [0,64,1,63], [2,62,3,61], [4,60,5,59], ..., [31,34,32,33] }`. * The corresponding output @p thread_keys in those threads will be * `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [31,32,33,34] }`. */ template < typename KeyT, int ITEMS_PER_THREAD, int LOGICAL_WARP_THREADS = CUB_WARP_THREADS(0), typename ValueT = NullType, int LEGACY_PTX_ARCH = 0> class WarpMergeSort : public BlockMergeSortStrategy< KeyT, ValueT, LOGICAL_WARP_THREADS, ITEMS_PER_THREAD, WarpMergeSort> { private: constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); constexpr static bool KEYS_ONLY = std::is_same::value; constexpr static int TILE_SIZE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS; using BlockMergeSortStrategyT = BlockMergeSortStrategy; const unsigned int warp_id; const unsigned int member_mask; public: WarpMergeSort() = delete; __device__ __forceinline__ WarpMergeSort(typename BlockMergeSortStrategyT::TempStorage &temp_storage) : BlockMergeSortStrategyT(temp_storage, IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) , warp_id(IS_ARCH_WARP ? 
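        // With a full hardware warp there is exactly one logical warp per
        // hardware warp, so its id is 0; otherwise the logical warp id is
        // derived from the hardware lane id: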
0 : (LaneId() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { } __device__ __forceinline__ unsigned int get_member_mask() const { return member_mask; } private: __device__ __forceinline__ void SyncImplementation() const { WARP_SYNC(member_mask); } friend BlockMergeSortStrategyT; }; CUB_NAMESPACE_END cub-2.0.1/cub/warp/warp_reduce.cuh000066400000000000000000000633551434614775400170410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. */ #pragma once #include "../config.cuh" #include "specializations/warp_reduce_shfl.cuh" #include "specializations/warp_reduce_smem.cuh" #include "../thread/thread_operators.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \addtogroup WarpModule * @{ */ /** * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) * * \tparam T The reduction input/output element type * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A reduction (or fold) * uses a binary combining operator to compute a single aggregate from a list of input elements. 
* - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS * * \par Performance Considerations * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) * - Uses synchronization-free communication between warp lanes when applicable * - Incurs zero bank conflicts for most types * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: * - Summation (vs. generic reduction) * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS * * \par Simple Examples * \warpcollective{WarpReduce} * \par * The code snippet below illustrates four concurrent warp sum reductions within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for 4 warps * __shared__ typename WarpReduce::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) * int warp_id = threadIdx.x / 32; * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, * \p 2544, and \p 3568, respectively (and is undefined in other threads). * * \par * The code snippet below illustrates a single warp sum reduction within a block of * 128 threads. * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for one warp * __shared__ typename WarpReduce::TempStorage temp_storage; * ... * * // Only the first warp performs a reduction * if (threadIdx.x < 32) * { * // Obtain one input item per thread * int thread_data = ... * * // Return the warp-wide sum to lane0 * int aggregate = WarpReduce(temp_storage).Sum(thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). * */ template < typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int LEGACY_PTX_ARCH = 0> class WarpReduce { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = PowerOfTwo::VALUE, }; public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Internal specialization. 
/// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpReduce = cub::detail::conditional_t< IS_POW_OF_TWO, WarpReduceShfl, WarpReduceSmem>; #endif // DOXYGEN_SHOULD_SKIP_THIS private: /// Shared memory storage layout type for WarpReduce typedef typename InternalWarpReduce::TempStorage _TempStorage; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /****************************************************************************** * Utility methods ******************************************************************************/ public: /// \smemstorage{WarpReduce} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. */ __device__ __forceinline__ WarpReduce( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()) {} //@} end member group /******************************************************************//** * \name Summation reductions *********************************************************************/ //@{ /** * \brief Computes a warp-wide sum in the calling warp. The output is valid in warp lane0. * * \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp sum reductions within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for 4 warps * __shared__ typename WarpReduce::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Return the warp-wide sums to each lane0 * int warp_id = threadIdx.x / 32; * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, * \p 2544, and \p 3568, respectively (and is undefined in other threads). * */ __device__ __forceinline__ T Sum( T input) ///< [in] Calling thread's input { return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); } /** * \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane0. * * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. * * \smemreuse * * \par Snippet * The code snippet below illustrates a sum reduction within a single, partially-full * block of 32 threads (one warp). 
* \par * \code * #include * * __global__ void ExampleKernel(int *d_data, int valid_items) * { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for one warp * __shared__ typename WarpReduce::TempStorage temp_storage; * * // Obtain one input item per thread if in range * int thread_data; * if (threadIdx.x < valid_items) * thread_data = d_data[threadIdx.x]; * * // Return the warp-wide sums to each lane0 * int aggregate = WarpReduce(temp_storage).Sum( * thread_data, valid_items); * * \endcode * \par * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is * undefined in other threads). * */ __device__ __forceinline__ T Sum( T input, ///< [in] Calling thread's input int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) { // Determine if we don't need bounds checking return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); } /** * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). * * \smemreuse * * \par Snippet * The code snippet below illustrates a head-segmented warp sum * reduction within a block of 32 threads (one warp). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for one warp * __shared__ typename WarpReduce::TempStorage temp_storage; * * // Obtain one input item and flag per thread * int thread_data = ... * int head_flag = ... * * // Return the warp-wide sums to each lane0 * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( * thread_data, head_flag); * * \endcode * \par * Suppose the set of input \p thread_data and \p head_flag across the block of threads * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). * * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) * */ template < typename FlagT> __device__ __forceinline__ T HeadSegmentedSum( T input, ///< [in] Calling thread's input FlagT head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment { return HeadSegmentedReduce(input, head_flag, cub::Sum()); } /** * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). * * \smemreuse * * \par Snippet * The code snippet below illustrates a tail-segmented warp sum * reduction within a block of 32 threads (one warp). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for one warp * __shared__ typename WarpReduce::TempStorage temp_storage; * * // Obtain one input item and flag per thread * int thread_data = ... * int tail_flag = ... 
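 *     // (tail_flag is 1 in the last lane of each segment and 0 elsewhere,
 *     //  matching the sample flags described below)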
* * // Return the warp-wide sums to each lane0 * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( * thread_data, tail_flag); * * \endcode * \par * Suppose the set of input \p thread_data and \p tail_flag across the block of threads * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). * * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ template < typename FlagT> __device__ __forceinline__ T TailSegmentedSum( T input, ///< [in] Calling thread's input FlagT tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment { return TailSegmentedReduce(input, tail_flag, cub::Sum()); } //@} end member group /******************************************************************//** * \name Generic reductions *********************************************************************/ //@{ /** * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. * * Supports non-commutative reduction operators * * \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp max reductions within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for 4 warps * __shared__ typename WarpReduce::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Return the warp-wide reductions to each lane0 * int warp_id = threadIdx.x / 32; * int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( * thread_data, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, * \p 95, and \p 127, respectively (and is undefined in other threads). * * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input ReductionOp reduction_op) ///< [in] Binary reduction operator { return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); } /** * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. * * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. * * Supports non-commutative reduction operators * * \smemreuse * * \par Snippet * The code snippet below illustrates a max reduction within a single, partially-full * block of 32 threads (one warp). 
* \par * \code * #include * * __global__ void ExampleKernel(int *d_data, int valid_items) * { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for one warp * __shared__ typename WarpReduce::TempStorage temp_storage; * * // Obtain one input item per thread if in range * int thread_data; * if (threadIdx.x < valid_items) * thread_data = d_data[threadIdx.x]; * * // Return the warp-wide reductions to each lane0 * int aggregate = WarpReduce(temp_storage).Reduce( * thread_data, cub::Max(), valid_items); * * \endcode * \par * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is * undefined in other threads). * * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input ReductionOp reduction_op, ///< [in] Binary reduction operator int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) { return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); } /** * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). * * Supports non-commutative reduction operators * * \smemreuse * * \par Snippet * The code snippet below illustrates a head-segmented warp max * reduction within a block of 32 threads (one warp). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for one warp * __shared__ typename WarpReduce::TempStorage temp_storage; * * // Obtain one input item and flag per thread * int thread_data = ... * int head_flag = ... * * // Return the warp-wide reductions to each lane0 * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( * thread_data, head_flag, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data and \p head_flag across the block of threads * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). * * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ template < typename ReductionOp, typename FlagT> __device__ __forceinline__ T HeadSegmentedReduce( T input, ///< [in] Calling thread's input FlagT head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment ReductionOp reduction_op) ///< [in] Reduction operator { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); } /** * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). * * Supports non-commutative reduction operators * * \smemreuse * * \par Snippet * The code snippet below illustrates a tail-segmented warp max * reduction within a block of 32 threads (one warp). * \par * \code * #include * * __global__ void ExampleKernel(...) 
* { * // Specialize WarpReduce for type int * typedef cub::WarpReduce WarpReduce; * * // Allocate WarpReduce shared memory for one warp * __shared__ typename WarpReduce::TempStorage temp_storage; * * // Obtain one input item and flag per thread * int thread_data = ... * int tail_flag = ... * * // Return the warp-wide reductions to each lane0 * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( * thread_data, tail_flag, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data and \p tail_flag across the block of threads * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). * * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ template < typename ReductionOp, typename FlagT> __device__ __forceinline__ T TailSegmentedReduce( T input, ///< [in] Calling thread's input FlagT tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment ReductionOp reduction_op) ///< [in] Reduction operator { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); } //@} end member group }; template class WarpReduce { private: using _TempStorage = cub::NullType; public: struct TempStorage : Uninitialized<_TempStorage> {}; __device__ __forceinline__ WarpReduce(TempStorage & /*temp_storage */) {} __device__ __forceinline__ T Sum(T input) { return input; } __device__ __forceinline__ T Sum(T input, int /* valid_items */) { return input; } template __device__ __forceinline__ T HeadSegmentedSum(T input, FlagT /* head_flag */) { return input; } template __device__ __forceinline__ T TailSegmentedSum(T input, FlagT /* tail_flag */) { return input; } template __device__ __forceinline__ T Reduce(T input, ReductionOp /* reduction_op */) { return input; } template __device__ __forceinline__ T Reduce(T input, ReductionOp /* reduction_op */, int /* valid_items */) { return input; } template __device__ __forceinline__ T HeadSegmentedReduce(T input, FlagT /* head_flag */, ReductionOp /* reduction_op */) { return input; } template __device__ __forceinline__ T TailSegmentedReduce(T input, FlagT /* tail_flag */, ReductionOp /* reduction_op */) { return input; } }; /** @} */ // end group WarpModule CUB_NAMESPACE_END cub-2.0.1/cub/warp/warp_scan.cuh000066400000000000000000001134251434614775400165100ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. */ #pragma once #include "../config.cuh" #include "specializations/warp_scan_shfl.cuh" #include "specializations/warp_scan_smem.cuh" #include "../thread/thread_operators.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN /** * \addtogroup WarpModule * @{ */ /** * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png) * * \tparam T The scan input/output element type * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) * produces an output list where each element is computed to be the reduction * of the elements occurring earlier in the input list. Prefix sum * connotes a prefix scan with the addition operator. The term \em inclusive indicates * that the ith output reduction incorporates the ith input. * The term \em exclusive indicates the ith input is not incorporated into * the ith output reduction. * - Supports non-commutative scan operators * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS * * \par Performance Considerations * - Uses special instructions when applicable (e.g., warp \p SHFL) * - Uses synchronization-free communication between warp lanes when applicable * - Incurs zero bank conflicts for most types * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: * - Summation (vs. generic scan) * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS * * \par Simple Examples * \warpcollective{WarpScan} * \par * The code snippet below illustrates four concurrent warp prefix sums within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... 
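 *     // (e.g., thread_data = 1 in every thread, matching the sample input
 *     //  described below)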
* * // Compute warp-wide prefix sums * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. * The corresponding output \p thread_data in each of the four warps of threads will be * 0, 1, 2, 3, ..., 31}. * * \par * The code snippet below illustrates a single warp prefix sum within a block of * 128 threads. * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for one warp * __shared__ typename WarpScan::TempStorage temp_storage; * ... * * // Only the first warp performs a prefix sum * if (threadIdx.x < 32) * { * // Obtain one input item per thread * int thread_data = ... * * // Compute warp-wide prefix sums * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}. * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}. * */ template < typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int LEGACY_PTX_ARCH = 0> class WarpScan { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), /// Whether the data type is an integer (which has fully-associative addition) IS_INTEGER = ((Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER)) }; /// Internal specialization. /// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpScan = cub::detail::conditional_t< IS_POW_OF_TWO, WarpScanShfl, WarpScanSmem>; /// Shared memory storage layout type for WarpScan typedef typename InternalWarpScan::TempStorage _TempStorage; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; unsigned int lane_id; /****************************************************************************** * Public types ******************************************************************************/ public: /// \smemstorage{WarpScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. */ __device__ __forceinline__ WarpScan( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), lane_id(IS_ARCH_WARP ? 
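                // With a full hardware warp the hardware lane id can be used
                // directly; otherwise reduce it modulo the logical warp size: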
LaneId() : LaneId() % LOGICAL_WARP_THREADS) {} //@} end member group /******************************************************************//** * \name Inclusive prefix sums *********************************************************************/ //@{ /** * \brief Computes an inclusive prefix sum across the calling warp. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute inclusive warp-wide prefix sums * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. * The corresponding output \p thread_data in each of the four warps of threads will be * 1, 2, 3, ..., 32}. */ __device__ __forceinline__ void InclusiveSum( T input, ///< [in] Calling thread's input item. T &inclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. { InclusiveScan(input, inclusive_output, cub::Sum()); } /** * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute inclusive warp-wide prefix sums * int warp_aggregate; * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. * The corresponding output \p thread_data in each of the four warps of threads will be * 1, 2, 3, ..., 32}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. */ __device__ __forceinline__ void InclusiveSum( T input, ///< [in] Calling thread's input item. T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. { InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); } //@} end member group /******************************************************************//** * \name Exclusive prefix sums *********************************************************************/ //@{ /** * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. * * \par * - \identityzero * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of * 128 threads (one per each of the 32-thread warps). 
* \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute exclusive warp-wide prefix sums * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. * The corresponding output \p thread_data in each of the four warps of threads will be * 0, 1, 2, ..., 31}. * */ __device__ __forceinline__ void ExclusiveSum( T input, ///< [in] Calling thread's input item. T &exclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. { T initial_value = 0; ExclusiveScan(input, exclusive_output, initial_value, cub::Sum()); } /** * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. * * \par * - \identityzero * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute exclusive warp-wide prefix sums * int warp_aggregate; * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. * The corresponding output \p thread_data in each of the four warps of threads will be * 0, 1, 2, ..., 31}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. */ __device__ __forceinline__ void ExclusiveSum( T input, ///< [in] Calling thread's input item. T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. { T initial_value = 0; ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); } //@} end member group /******************************************************************//** * \name Inclusive prefix scans *********************************************************************/ //@{ /** * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... 
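 *         // Illustration only: the sample input {0, -1, 2, -3, ...} discussed
 *         // below could be produced by:
 *         //   thread_data = (threadIdx.x % 2) ? -(int)threadIdx.x : (int)threadIdx.x;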
* * // Compute inclusive warp-wide prefix max scans * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. * The corresponding output \p thread_data in the first warp would be * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. * * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item. T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. ScanOp scan_op) ///< [in] Binary scan operator { InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); } /** * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute inclusive warp-wide prefix max scans * int warp_aggregate; * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).InclusiveScan( * thread_data, thread_data, cub::Max(), warp_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. * The corresponding output \p thread_data in the first warp would be * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads * in the second warp, etc. * * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item. T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. ScanOp scan_op, ///< [in] Binary scan operator T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. { InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); } //@} end member group /******************************************************************//** * \name Exclusive prefix scans *********************************************************************/ //@{ /** * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) 
* { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute exclusive warp-wide prefix max scans * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. * The corresponding output \p thread_data in the first warp would be * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. * (The output \p thread_data in warp lane0 is undefined.) * * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item. T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. ScanOp scan_op) ///< [in] Binary scan operator { InternalWarpScan internal(temp_storage); T inclusive_output; internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update( input, inclusive_output, exclusive_output, scan_op, Int2Type()); } /** * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute exclusive warp-wide prefix max scans * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. * The corresponding output \p thread_data in the first warp would be * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. * * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item. T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. T initial_value, ///< [in] Initial value to seed the exclusive scan ScanOp scan_op) ///< [in] Binary scan operator { InternalWarpScan internal(temp_storage); T inclusive_output; internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update( input, inclusive_output, exclusive_output, scan_op, initial_value, Int2Type()); } /** * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. 
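 *
 * \par
 * - If a defined value is needed in warp-lane0, either use the overload that
 *   accepts an explicit \p initial_value, or overwrite lane0's result afterwards.
 *   A minimal sketch (assuming the \p temp_storage, \p warp_id, and \p thread_data
 *   names from the snippet below):
 *   \code
 *   int exclusive_partial, warp_aggregate;
 *   WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, exclusive_partial, cub::Max(), warp_aggregate);
 *   if (cub::LaneId() == 0) exclusive_partial = INT_MIN;  // lane0's output is otherwise undefined
 *   \endcode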
* * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute exclusive warp-wide prefix max scans * int warp_aggregate; * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. * The corresponding output \p thread_data in the first warp would be * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads * in the second warp, etc. * * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item. T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. ScanOp scan_op, ///< [in] Binary scan operator T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. { InternalWarpScan internal(temp_storage); T inclusive_output; internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update( input, inclusive_output, exclusive_output, warp_aggregate, scan_op, Int2Type()); } /** * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute exclusive warp-wide prefix max scans * int warp_aggregate; * int warp_id = threadIdx.x / 32; * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. * The corresponding output \p thread_data in the first warp would be * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads * in the second warp, etc. * * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item. T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
T initial_value, ///< [in] Initial value to seed the exclusive scan ScanOp scan_op, ///< [in] Binary scan operator T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. { InternalWarpScan internal(temp_storage); T inclusive_output; internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update( input, inclusive_output, exclusive_output, warp_aggregate, scan_op, initial_value, Int2Type()); } //@} end member group /******************************************************************//** * \name Combination (inclusive & exclusive) prefix scans *********************************************************************/ //@{ /** * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p exclusive_output computed for warp-lane0 is undefined. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Compute exclusive warp-wide prefix max scans * int inclusive_partial, exclusive_partial; * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. * The corresponding output \p inclusive_partial in the first warp would be * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. * The corresponding output \p exclusive_partial in the first warp would be * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. * (The output \p thread_data in warp lane0 is undefined.) * * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void Scan( T input, ///< [in] Calling thread's input item. T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. ScanOp scan_op) ///< [in] Binary scan operator { InternalWarpScan internal(temp_storage); internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update( input, inclusive_output, exclusive_output, scan_op, Int2Type()); } /** * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of * 128 threads (one per each of the 32-thread warps). * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... 
* * // Compute inclusive warp-wide prefix max scans * int warp_id = threadIdx.x / 32; * int inclusive_partial, exclusive_partial; * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max()); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. * The corresponding output \p inclusive_partial in the first warp would be * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. * The corresponding output \p exclusive_partial in the first warp would be * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. * * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ void Scan( T input, ///< [in] Calling thread's input item. T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. T initial_value, ///< [in] Initial value to seed the exclusive scan ScanOp scan_op) ///< [in] Binary scan operator { InternalWarpScan internal(temp_storage); internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update( input, inclusive_output, exclusive_output, scan_op, initial_value, Int2Type()); } //@} end member group /******************************************************************//** * \name Data exchange *********************************************************************/ //@{ /** * \brief Broadcast the value \p input from warp-lanesrc_lane to all lanes in the warp * * \par * - \smemreuse * * \par Snippet * The code snippet below illustrates the warp-wide broadcasts of values from * lanes0 in each of four warps to all other threads in those warps. * \par * \code * #include * * __global__ void ExampleKernel(...) * { * // Specialize WarpScan for type int * typedef cub::WarpScan WarpScan; * * // Allocate WarpScan shared memory for 4 warps * __shared__ typename WarpScan::TempStorage temp_storage[4]; * * // Obtain one input item per thread * int thread_data = ... * * // Broadcast from lane0 in each warp to all other threads in the warp * int warp_id = threadIdx.x / 32; * thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0); * * \endcode * \par * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. * The corresponding output \p thread_data will be * {0, 0, ..., 0} in warp0, * {32, 32, ..., 32} in warp1, * {64, 64, ..., 64} in warp2, etc. */ __device__ __forceinline__ T Broadcast( T input, ///< [in] The value to broadcast unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting { return InternalWarpScan(temp_storage).Broadcast(input, src_lane); } //@} end member group }; /** @} */ // end group WarpModule CUB_NAMESPACE_END cub-2.0.1/cub/warp/warp_store.cuh000066400000000000000000000475731434614775400167320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Operations for writing linear segments of data from the CUDA warp */ #pragma once #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief cub::WarpStoreAlgorithm enumerates alternative algorithms for * cub::WarpStore to write a blocked arrangement of items across a CUDA * warp to a linear segment of memory. */ enum WarpStoreAlgorithm { /** * @par Overview * A [blocked arrangement](index.html#sec5sec3) of data is written * directly to memory. * * @par Performance Considerations * The utilization of memory transactions (coalescing) decreases as the * access stride between threads increases (i.e., the number items per thread). */ WARP_STORE_DIRECT, /** * @par Overview * A [striped arrangement](index.html#sec5sec3) of data is written * directly to memory. * * @par Performance Considerations * The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. */ WARP_STORE_STRIPED, /** * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is written * directly to memory using CUDA's built-in vectorized stores as a coalescing * optimization. For example, st.global.v4.s32 instructions will be * generated when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0. * * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high until * the the access stride between threads (i.e., the number items per thread) * exceeds the maximum vector store width (typically 4 items or 64B, * whichever is lower). * - The following conditions will prevent vectorization and writing will fall * back to cub::WARP_STORE_DIRECT: * - @p ITEMS_PER_THREAD is odd * - The @p OutputIteratorT is not a simple pointer type * - The block output offset is not quadword-aligned * - The data type @p T is not a built-in primitive or CUDA vector type * (e.g., @p short, @p int2, @p double, @p float2, etc.) */ WARP_STORE_VECTORIZE, /** * @par Overview * A [blocked arrangement](index.html#sec5sec3) is locally * transposed and then efficiently written to memory as a * [striped arrangement](index.html#sec5sec3). * * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high * regardless of items written per thread. 
* - The local reordering incurs slightly longer latencies and throughput than the * direct cub::WARP_STORE_DIRECT and cub::WARP_STORE_VECTORIZE alternatives. */ WARP_STORE_TRANSPOSE }; /** * @brief The WarpStore class provides [collective](index.html#sec0) * data movement methods for writing a [blocked arrangement](index.html#sec5sec3) * of items partitioned across a CUDA warp to a linear segment of memory. * @ingroup WarpModule * @ingroup UtilIo * * @tparam T * The type of data to be written. * * @tparam ITEMS_PER_THREAD * The number of consecutive items partitioned onto each thread. * * @tparam ALGORITHM * [optional] cub::WarpStoreAlgorithm tuning policy enumeration. * default: cub::WARP_STORE_DIRECT. * * @tparam LOGICAL_WARP_THREADS * [optional] The number of threads per "logical" warp (may be less * than the number of hardware warp threads). Default is the warp size of the * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * * @tparam LEGACY_PTX_ARCH * Unused. * * @par Overview * - The WarpStore class provides a single data movement abstraction that can be * specialized to implement different cub::WarpStoreAlgorithm strategies. This * facilitates different performance policies for different architectures, * data types, granularity sizes, etc. * - WarpStore can be optionally specialized by different data movement strategies: * -# cub::WARP_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) * of data is written directly to memory. [More...](@ref cub::WarpStoreAlgorithm) * -# cub::WARP_STORE_STRIPED. A [striped arrangement](index.html#sec5sec3) * of data is written directly to memory. [More...](@ref cub::WarpStoreAlgorithm) * -# cub::WARP_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) * of data is written directly to memory using CUDA's built-in vectorized * stores as a coalescing optimization. [More...](@ref cub::WarpStoreAlgorithm) * -# cub::WARP_STORE_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) * is locally transposed into a [striped arrangement](index.html#sec5sec3) * which is then written to memory. [More...](@ref cub::WarpStoreAlgorithm) * - \rowmajor * * @par A Simple Example * @par * The code snippet below illustrates the storing of a "blocked" arrangement * of 64 integers across 16 threads (where each thread owns 4 consecutive items) * into a linear segment of memory. The store is specialized for * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so * that memory references will be efficiently coalesced using a warp-striped * access pattern. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * * // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each * using WarpStoreT = WarpStore; * * constexpr int warps_in_block = block_threads / warp_threads; * constexpr int tile_size = items_per_thread * warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Allocate shared memory for WarpStore * __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... 
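 *     // Illustration only: the blocked arrangement discussed below (thread0
 *     // owns [0,1,2,3], thread1 owns [4,5,6,7], ...) could be produced by a
 *     // matching cub::WarpLoad or computed directly in registers.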
* * // Store items to linear memory * WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data); * @endcode * @par * Suppose the set of @p thread_data across the warp threads is * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }. * The output @p d_data will be 0, 1, 2, 3, 4, 5, .... */ template class WarpStore { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); private: /// Store helper template struct StoreInternal; template struct StoreInternal { typedef NullType TempStorage; int linear_tid; __device__ __forceinline__ StoreInternal(TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } }; template struct StoreInternal { typedef NullType TempStorage; int linear_tid; __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectStriped(linear_tid, block_itr, items); } template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { StoreDirectStriped(linear_tid, block_itr, items, valid_items); } }; template struct StoreInternal { typedef NullType TempStorage; int linear_tid; __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} __device__ __forceinline__ void Store(T *block_ptr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlockedVectorized(linear_tid, block_ptr, items); } template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } }; template struct StoreInternal { using WarpExchangeT = WarpExchange; struct _TempStorage : WarpExchangeT::TempStorage {}; struct TempStorage : Uninitialized<_TempStorage> {}; _TempStorage &temp_storage; int linear_tid; __device__ __forceinline__ StoreInternal(TempStorage &temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { WarpExchangeT(temp_storage).BlockedToStriped(items, items); StoreDirectStriped(linear_tid, block_itr, items); } template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { WarpExchangeT(temp_storage).BlockedToStriped(items, items); StoreDirectStriped(linear_tid, block_itr, items, valid_items); } }; /// Internal load implementation to use using InternalStore = StoreInternal; /// Shared memory storage layout type using _TempStorage = typename InternalStore::TempStorage; __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } _TempStorage &temp_storage; int linear_tid; public: struct TempStorage : 
Uninitialized<_TempStorage> {}; /*************************************************************************//** * @name Collective constructors ****************************************************************************/ //@{ /** * @brief Collective constructor using a private static allocation of shared * memory as temporary storage. */ __device__ __forceinline__ WarpStore() : temp_storage(PrivateStorage()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} /** * @brief Collective constructor using the specified memory allocation as * temporary storage. */ __device__ __forceinline__ WarpStore(TempStorage &temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} //@} end member group /*************************************************************************//** * @name Data movement ****************************************************************************/ //@{ /** * @brief Store items into a linear segment of memory. * * @par * \smemreuse * * @par Snippet * @par * The code snippet below illustrates the storing of a "blocked" arrangement * of 64 integers across 16 threads (where each thread owns 4 consecutive items) * into a linear segment of memory. The store is specialized for * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so * that memory references will be efficiently coalesced using a warp-striped * access pattern. * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * * // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each * using WarpStoreT = WarpStore; * * constexpr int warps_in_block = block_threads / warp_threads; * constexpr int tile_size = items_per_thread * warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Allocate shared memory for WarpStore * __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Store items to linear memory * WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data); * @endcode * @par * Suppose the set of @p thread_data across the warp threads is * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }. * The output @p d_data will be 0, 1, 2, 3, 4, 5, .... * * @param[out] block_itr The thread block's base output iterator for storing to * @param[in] items Data to store */ template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { InternalStore(temp_storage, linear_tid).Store(block_itr, items); } /** * @brief Store items into a linear segment of memory, guarded by range. * * @par * \smemreuse * * @par Snippet * @par * The code snippet below illustrates the storing of a "blocked" arrangement * of 64 integers across 16 threads (where each thread owns 4 consecutive items) * into a linear segment of memory. The store is specialized for * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so * that memory references will be efficiently coalesced using a warp-striped * access pattern. * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items ...) 
* { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * * // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each * using WarpStoreT = WarpStore; * * constexpr int warps_in_block = block_threads / warp_threads; * constexpr int tile_size = items_per_thread * warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Allocate shared memory for WarpStore * __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Store items to linear memory * WarpStoreT(temp_storage[warp_id]).Store( * d_data + warp_id * tile_size, thread_data, valid_items); * @endcode * @par * Suppose the set of @p thread_data across the warp threads is * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] } and @p valid_items * is @p 5.. The output @p d_data will be 0, 1, 2, 3, 4, ?, ?, ..., * with only the first two threads being unmasked to store portions of valid * data. * * @param[out] block_itr The thread block's base output iterator for storing to * @param[in] items Data to store * @param[in] valid_items Number of valid items to write */ template __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); } //@} end member group }; CUB_NAMESPACE_END cub-2.0.1/examples/000077500000000000000000000000001434614775400141205ustar00rootroot00000000000000cub-2.0.1/examples/CMakeLists.txt000066400000000000000000000047321434614775400166660ustar00rootroot00000000000000# Create meta targets that build all examples for a single configuration: foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) set(config_meta_target ${config_prefix}.examples) add_custom_target(${config_meta_target}) add_dependencies(${config_prefix}.all ${config_meta_target}) endforeach() # Update flags to reflect RDC options. See note in CubCudaConfig.cmake -- # these flag variables behave unintuitively: if (CUB_ENABLE_EXAMPLES_WITH_RDC) set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_RDC}") else() set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}") endif() ## cub_add_example # # Add an example executable and register it with ctest. # # target_name_var: Variable name to overwrite with the name of the example # target. Useful for post-processing target information per-backend. # example_name: The name of the example minus ".example." For # instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu # would be "cuda.copy". # example_src: The source file that implements the example. # cub_target: The reference cub target with configuration information. 
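#
# Example usage (illustrative; mirrors the call made from
# examples/block/CMakeLists.txt, where `example_name` and `example_src` are
# derived from a foreach over the example sources):
#
#   cub_add_example(target_name ${example_name} "${example_src}" ${cub_target})
#   # ${target_name} now holds the name of the per-configuration example target.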
# function(cub_add_example target_name_var example_name example_src cub_target) cub_get_target_property(config_prefix ${cub_target} PREFIX) # The actual name of the test's target: set(example_target ${config_prefix}.example.${example_name}) set(${target_name_var} ${example_target} PARENT_SCOPE) # Related target names: set(config_meta_target ${config_prefix}.examples) set(example_meta_target cub.all.example.${example_name}) add_executable(${example_target} "${example_src}") target_link_libraries(${example_target} ${cub_target}) cub_clone_target_properties(${example_target} ${cub_target}) target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples") # Add to the active configuration's meta target add_dependencies(${config_meta_target} ${example_target}) # Meta target that builds examples with this name for all configurations: if (NOT TARGET ${example_meta_target}) add_custom_target(${example_meta_target}) endif() add_dependencies(${example_meta_target} ${example_target}) if (CUB_ENABLE_EXAMPLES_WITH_RDC) cub_enable_rdc_for_cuda_target(${example_target}) endif() add_test(NAME ${example_target} COMMAND "$" ) endfunction() add_subdirectory(cmake) add_subdirectory(block) add_subdirectory(device) cub-2.0.1/examples/block/000077500000000000000000000000001434614775400152125ustar00rootroot00000000000000cub-2.0.1/examples/block/.gitignore000066400000000000000000000001051434614775400171760ustar00rootroot00000000000000/bin /Debug /Release /cuda55.sdf /cuda55.suo /cuda60.sdf /cuda60.suo cub-2.0.1/examples/block/CMakeLists.txt000066400000000000000000000007271434614775400177600ustar00rootroot00000000000000file(GLOB_RECURSE example_srcs RELATIVE "${CMAKE_CURRENT_LIST_DIR}" CONFIGURE_DEPENDS example_*.cu ) foreach (cub_target IN LISTS CUB_TARGETS) foreach (example_src IN LISTS example_srcs) get_filename_component(example_name "${example_src}" NAME_WE) string(REGEX REPLACE "^example_block_" "block." example_name "${example_name}" ) cub_add_example(target_name ${example_name} "${example_src}" ${cub_target}) endforeach() endforeach() cub-2.0.1/examples/block/example_block_radix_sort.cu000066400000000000000000000247701434614775400226200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockRadixSort * * To compile using the command line: * nvcc -arch=sm_XX example_block_radix_sort.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Timing iterations int g_timing_iterations = 100; /// Default grid size int g_grid_size = 1; /// Uniform key samples bool g_uniform_keys; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide sorting over integers */ template < typename Key, int BLOCK_THREADS, int ITEMS_PER_THREAD> __launch_bounds__ (BLOCK_THREADS) __global__ void BlockSortKernel( Key *d_in, // Tile of input Key *d_out, // Tile of output clock_t *d_elapsed) // Elapsed cycle count of block scan { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) typedef BlockLoad BlockLoadT; // Specialize BlockRadixSort type for our thread block typedef BlockRadixSort BlockRadixSortT; // Shared memory __shared__ union TempStorage { typename BlockLoadT::TempStorage load; typename BlockRadixSortT::TempStorage sort; } temp_storage; // Per-thread tile items Key items[ITEMS_PER_THREAD]; // Our current block's offset int block_offset = blockIdx.x * TILE_SIZE; // Load items into a blocked arrangement BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); // Barrier for smem reuse __syncthreads(); // Start cycle timer clock_t start = clock(); // Sort keys BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); // Stop cycle timer clock_t stop = clock(); // Store output in striped fashion StoreDirectStriped(threadIdx.x, d_out + block_offset, items); // Store elapsed clocks if (threadIdx.x == 0) { d_elapsed[blockIdx.x] = (start > stop) ? start - stop : stop - start; } } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize sorting problem (and solution). 
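 * (h_reference receives a copy of h_in with only the first tile sorted; the
 * result check below compares just that first tile.)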
*/ template void Initialize( Key *h_in, Key *h_reference, int num_items, int tile_size) { for (int i = 0; i < num_items; ++i) { if (g_uniform_keys) { h_in[i] = 0; } else { RandomBits(h_in[i]); } h_reference[i] = h_in[i]; } // Only sort the first tile std::sort(h_reference, h_reference + tile_size); } /** * Test BlockScan */ template < typename Key, int BLOCK_THREADS, int ITEMS_PER_THREAD> void Test() { const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays Key *h_in = new Key[TILE_SIZE * g_grid_size]; Key *h_reference = new Key[TILE_SIZE * g_grid_size]; clock_t *h_elapsed = new clock_t[g_grid_size]; // Initialize problem and reference output on host Initialize(h_in, h_reference, TILE_SIZE * g_grid_size, TILE_SIZE); // Initialize device arrays Key *d_in = NULL; Key *d_out = NULL; clock_t *d_elapsed = NULL; CubDebugExit(cudaMalloc((void**)&d_in, sizeof(Key) * TILE_SIZE * g_grid_size)); CubDebugExit(cudaMalloc((void**)&d_out, sizeof(Key) * TILE_SIZE * g_grid_size)); CubDebugExit(cudaMalloc((void**)&d_elapsed, sizeof(clock_t) * g_grid_size)); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) std::cout << h_in[i] << ", "; printf("\n\n"); } // Kernel props int max_sm_occupancy; CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSortKernel, BLOCK_THREADS)); // Copy problem to device CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(Key) * TILE_SIZE * g_grid_size, cudaMemcpyHostToDevice)); printf("BlockRadixSort %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n", TILE_SIZE * g_grid_size, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy); fflush(stdout); // Run kernel once to prime caches and check result BlockSortKernel<<>>( d_in, d_out, d_elapsed); // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Check results printf("\tOutput items: "); int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose); printf("%s\n", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); fflush(stdout); // Run this several times and average the performance results GpuTimer timer; float elapsed_millis = 0.0; unsigned long long elapsed_clocks = 0; for (int i = 0; i < g_timing_iterations; ++i) { timer.Start(); // Run kernel BlockSortKernel<<>>( d_in, d_out, d_elapsed); timer.Stop(); elapsed_millis += timer.ElapsedMillis(); // Copy clocks from device CubDebugExit(cudaMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost)); for (int j = 0; j < g_grid_size; j++) { elapsed_clocks += h_elapsed[j]; } } // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f; double avg_clocks = double(elapsed_clocks) / g_timing_iterations / g_grid_size; double avg_clocks_per_item = avg_clocks / TILE_SIZE; printf("\tAverage BlockRadixSort::SortBlocked clocks: %.3f\n", avg_clocks); printf("\tAverage BlockRadixSort::SortBlocked clocks per item: %.3f\n", avg_clocks_per_item); printf("\tAverage kernel millis: %.4f\n", avg_millis); printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec); fflush(stdout); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (h_elapsed) delete[] h_elapsed; if (d_in) CubDebugExit(cudaFree(d_in)); if (d_out) CubDebugExit(cudaFree(d_out)); if (d_elapsed) CubDebugExit(cudaFree(d_elapsed)); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); g_uniform_keys = args.CheckCmdLineFlag("uniform"); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--i=]" "[--grid-size=]" "[--v] " "\n", argv[0], g_timing_iterations, g_grid_size); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); fflush(stdout); // Run tests printf("\nuint32:\n"); fflush(stdout); Test(); printf("\n"); fflush(stdout); printf("\nfp32:\n"); fflush(stdout); Test(); printf("\n"); fflush(stdout); printf("\nuint8:\n"); fflush(stdout); Test(); printf("\n"); fflush(stdout); return 0; } cub-2.0.1/examples/block/example_block_reduce.cu000066400000000000000000000225761434614775400217130ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockReduce * * To compile using the command line: * nvcc -arch=sm_XX example_block_reduce.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Timing iterations int g_timing_iterations = 100; /// Default grid size int g_grid_size = 1; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide reduction. */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockReduceAlgorithm ALGORITHM> __global__ void BlockReduceKernel( int *d_in, // Tile of input int *d_out, // Tile aggregate clock_t *d_elapsed) // Elapsed cycle count of block reduction { // Specialize BlockReduce type for our thread block typedef BlockReduce BlockReduceT; // Shared memory __shared__ typename BlockReduceT::TempStorage temp_storage; // Per-thread tile data int data[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in, data); // Start cycle timer clock_t start = clock(); // Compute sum int aggregate = BlockReduceT(temp_storage).Sum(data); // Stop cycle timer clock_t stop = clock(); // Store aggregate and elapsed clocks if (threadIdx.x == 0) { *d_elapsed = (start > stop) ? start - stop : stop - start; *d_out = aggregate; } } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize reduction problem (and solution). 
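 * Each input element is set to i % 17.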
* Returns the aggregate */ int Initialize(int *h_in, int num_items) { int inclusive = 0; for (int i = 0; i < num_items; ++i) { h_in[i] = i % 17; inclusive += h_in[i]; } return inclusive; } /** * Test thread block reduction */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockReduceAlgorithm ALGORITHM> void Test() { const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays int *h_in = new int[TILE_SIZE]; int *h_gpu = new int[TILE_SIZE + 1]; // Initialize problem and reference output on host int h_aggregate = Initialize(h_in, TILE_SIZE); // Initialize device arrays int *d_in = NULL; int *d_out = NULL; clock_t *d_elapsed = NULL; cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE); cudaMalloc((void**)&d_out, sizeof(int) * 1); cudaMalloc((void**)&d_elapsed, sizeof(clock_t)); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) printf("%d, ", h_in[i]); printf("\n\n"); } // Kernel props int max_sm_occupancy; CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockReduceKernel, BLOCK_THREADS)); // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); printf("BlockReduce algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n", (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS", TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy); // Run kernel BlockReduceKernel<<>>( d_in, d_out, d_elapsed); // Check total aggregate printf("\tAggregate: "); int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Run this several times and average the performance results GpuTimer timer; float elapsed_millis = 0.0; clock_t elapsed_clocks = 0; for (int i = 0; i < g_timing_iterations; ++i) { // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); timer.Start(); // Run kernel BlockReduceKernel<<>>( d_in, d_out, d_elapsed); timer.Stop(); elapsed_millis += timer.ElapsedMillis(); // Copy clocks from device clock_t clocks; CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost)); elapsed_clocks += clocks; } // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f; float avg_clocks = float(elapsed_clocks) / g_timing_iterations; float avg_clocks_per_item = avg_clocks / TILE_SIZE; printf("\tAverage BlockReduce::Sum clocks: %.3f\n", avg_clocks); printf("\tAverage BlockReduce::Sum clocks per item: %.3f\n", avg_clocks_per_item); printf("\tAverage kernel millis: %.4f\n", avg_millis); printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec); // Cleanup if (h_in) delete[] h_in; if (h_gpu) delete[] h_gpu; if (d_in) cudaFree(d_in); if (d_out) cudaFree(d_out); if (d_elapsed) cudaFree(d_elapsed); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--i=] " "[--grid-size=] " "[--v] " "\n", 
argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Run tests Test<1024, 1, BLOCK_REDUCE_RAKING>(); Test<512, 2, BLOCK_REDUCE_RAKING>(); Test<256, 4, BLOCK_REDUCE_RAKING>(); Test<128, 8, BLOCK_REDUCE_RAKING>(); Test<64, 16, BLOCK_REDUCE_RAKING>(); Test<32, 32, BLOCK_REDUCE_RAKING>(); Test<16, 64, BLOCK_REDUCE_RAKING>(); printf("-------------\n"); Test<1024, 1, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<512, 2, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<256, 4, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<128, 8, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<64, 16, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<32, 32, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<16, 64, BLOCK_REDUCE_WARP_REDUCTIONS>(); return 0; } cub-2.0.1/examples/block/example_block_reduce_dyn_smem.cu000066400000000000000000000162371434614775400236030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockReduce with dynamic shared memory * * To compile using the command line: * nvcc -arch=sm_XX example_block_reduce_dyn_smem.cu -I../.. 
-lcudart -O3 -std=c++14 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include #include #include "../../test/test_util.h" // Some implementation details rely on c++14 #if CUB_CPP_DIALECT >= 2014 using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Default grid size int g_grid_size = 1; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide reduction. */ template __global__ void BlockReduceKernel( int *d_in, // Tile of input int *d_out // Tile aggregate ) { // Specialize BlockReduce type for our thread block using BlockReduceT = cub::BlockReduce; using TempStorageT = typename BlockReduceT::TempStorage; union ShmemLayout { TempStorageT reduce; int aggregate; }; // shared memory byte-array extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; // cast to lvalue reference of expected type auto& temp_storage = reinterpret_cast(smem); int data = d_in[threadIdx.x]; // Compute sum int aggregate = BlockReduceT(temp_storage).Sum(data); // block-wide sync barrier necessary to re-use shared mem safely __syncthreads(); int* smem_integers = reinterpret_cast(smem); if (threadIdx.x == 0) smem_integers[0] = aggregate; // sync to make new shared value available to all threads __syncthreads(); aggregate = smem_integers[0]; // all threads write the aggregate to output d_out[threadIdx.x] = aggregate; } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize reduction problem (and solution). * Returns the aggregate */ int Initialize(int *h_in, int num_items) { int inclusive = 0; for (int i = 0; i < num_items; ++i) { h_in[i] = i % 17; inclusive += h_in[i]; } return inclusive; } /** * Test thread block reduction */ template void Test() { // Allocate host arrays int *h_in = new int[BLOCK_THREADS]; // Initialize problem and reference output on host int h_aggregate = Initialize(h_in, BLOCK_THREADS); // Initialize device arrays int *d_in = NULL; int *d_out = NULL; cudaMalloc((void**)&d_in, sizeof(int) * BLOCK_THREADS); cudaMalloc((void**)&d_out, sizeof(int) * BLOCK_THREADS); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < BLOCK_THREADS; i++) printf("%d, ", h_in[i]); printf("\n\n"); } // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice); // determine necessary storage size: auto block_reduce_temp_bytes = sizeof(typename cub::BlockReduce::TempStorage); // finally, we need to make sure that we can hold at least one integer // needed in the kernel to exchange data after reduction auto smem_size = (std::max)(1 * sizeof(int), block_reduce_temp_bytes); // use default stream cudaStream_t stream = NULL; // Run reduction kernel BlockReduceKernel <<>>( d_in, d_out); // Check total aggregate printf("\tAggregate: "); int compare = 0; for (int i = 0; i < BLOCK_THREADS; i++) { compare = compare || CompareDeviceResults( &h_aggregate, d_out + i, 1, g_verbose, g_verbose); } printf("%s\n", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Cleanup if (h_in) delete[] h_in; if (d_in) cudaFree(d_in); if (d_out) cudaFree(d_out); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--grid-size=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Run tests Test<1024>(); Test<512>(); Test<256>(); Test<128>(); Test<64>(); Test<32>(); Test<16>(); return 0; } #else // < C++14 int main() {} #endif cub-2.0.1/examples/block/example_block_scan.cu000066400000000000000000000261011434614775400213540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockScan * * To compile using the command line: * nvcc -arch=sm_XX example_block_scan.cu -I../.. 
-lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Timing iterations int g_timing_iterations = 100; /// Default grid size int g_grid_size = 1; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide exclusive prefix sum over integers */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockScanAlgorithm ALGORITHM> __global__ void BlockPrefixSumKernel( int *d_in, // Tile of input int *d_out, // Tile of output clock_t *d_elapsed) // Elapsed cycle count of block scan { // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) typedef BlockLoad BlockLoadT; // Specialize BlockStore type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) typedef BlockStore BlockStoreT; // Specialize BlockScan type for our thread block typedef BlockScan BlockScanT; // Shared memory __shared__ union TempStorage { typename BlockLoadT::TempStorage load; typename BlockStoreT::TempStorage store; typename BlockScanT::TempStorage scan; } temp_storage; // Per-thread tile data int data[ITEMS_PER_THREAD]; // Load items into a blocked arrangement BlockLoadT(temp_storage.load).Load(d_in, data); // Barrier for smem reuse __syncthreads(); // Start cycle timer clock_t start = clock(); // Compute exclusive prefix sum int aggregate; BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate); // Stop cycle timer clock_t stop = clock(); // Barrier for smem reuse __syncthreads(); // Store items from a blocked arrangement BlockStoreT(temp_storage.store).Store(d_out, data); // Store aggregate and elapsed clocks if (threadIdx.x == 0) { *d_elapsed = (start > stop) ? start - stop : stop - start; d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate; } } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize exclusive prefix sum problem (and solution). 
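 *
 * (For reference, a sketch of the collective specializations used in
 *  BlockPrefixSumKernel above, written out with explicit template arguments in
 *  the form documented for cub::BlockLoad, cub::BlockStore and cub::BlockScan;
 *  the warp-transpose variants correspond to the coalescing strategy described
 *  in the kernel's comments:
 *
 *    typedef cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD,
 *                           cub::BLOCK_LOAD_WARP_TRANSPOSE>   BlockLoadT;
 *    typedef cub::BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD,
 *                            cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
 *    typedef cub::BlockScan<int, BLOCK_THREADS, ALGORITHM>    BlockScanT;  )
 *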
* Returns the aggregate */ int Initialize( int *h_in, int *h_reference, int num_items) { int inclusive = 0; for (int i = 0; i < num_items; ++i) { h_in[i] = i % 17; h_reference[i] = inclusive; inclusive += h_in[i]; } return inclusive; } /** * Test thread block scan */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockScanAlgorithm ALGORITHM> void Test() { const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays int *h_in = new int[TILE_SIZE]; int *h_reference = new int[TILE_SIZE]; int *h_gpu = new int[TILE_SIZE + 1]; // Initialize problem and reference output on host int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE); // Initialize device arrays int *d_in = NULL; int *d_out = NULL; clock_t *d_elapsed = NULL; cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE); cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1)); cudaMalloc((void**)&d_elapsed, sizeof(clock_t)); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) printf("%d, ", h_in[i]); printf("\n\n"); } // Kernel props int max_sm_occupancy; CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel, BLOCK_THREADS)); // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n", (ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS", TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy); // Run aggregate/prefix kernel BlockPrefixSumKernel<<>>( d_in, d_out, d_elapsed); // Check results printf("\tOutput items: "); int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check total aggregate printf("\tAggregate: "); compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose); printf("%s\n", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Run this several times and average the performance results GpuTimer timer; float elapsed_millis = 0.0; clock_t elapsed_clocks = 0; for (int i = 0; i < g_timing_iterations; ++i) { // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); timer.Start(); // Run aggregate/prefix kernel BlockPrefixSumKernel<<>>( d_in, d_out, d_elapsed); timer.Stop(); elapsed_millis += timer.ElapsedMillis(); // Copy clocks from device clock_t clocks; CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost)); elapsed_clocks += clocks; } // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f; float avg_clocks = float(elapsed_clocks) / g_timing_iterations; float avg_clocks_per_item = avg_clocks / TILE_SIZE; printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks); printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item); printf("\tAverage kernel millis: %.4f\n", avg_millis); printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (h_gpu) delete[] h_gpu; if (d_in) cudaFree(d_in); if (d_out) cudaFree(d_out); if (d_elapsed) cudaFree(d_elapsed); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--i=]" "[--grid-size=]" "[--v] " "\n", argv[0], g_timing_iterations, g_grid_size); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Run tests Test<1024, 1, BLOCK_SCAN_RAKING>(); Test<512, 2, BLOCK_SCAN_RAKING>(); Test<256, 4, BLOCK_SCAN_RAKING>(); Test<128, 8, BLOCK_SCAN_RAKING>(); Test<64, 16, BLOCK_SCAN_RAKING>(); Test<32, 32, BLOCK_SCAN_RAKING>(); printf("-------------\n"); Test<1024, 1, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<512, 2, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<256, 4, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<128, 8, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<64, 16, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<32, 32, BLOCK_SCAN_RAKING_MEMOIZE>(); printf("-------------\n"); Test<1024, 1, BLOCK_SCAN_WARP_SCANS>(); Test<512, 2, BLOCK_SCAN_WARP_SCANS>(); Test<256, 4, BLOCK_SCAN_WARP_SCANS>(); Test<128, 8, BLOCK_SCAN_WARP_SCANS>(); Test<64, 16, BLOCK_SCAN_WARP_SCANS>(); Test<32, 32, BLOCK_SCAN_WARP_SCANS>(); return 0; } cub-2.0.1/examples/cmake/000077500000000000000000000000001434614775400152005ustar00rootroot00000000000000cub-2.0.1/examples/cmake/CMakeLists.txt000066400000000000000000000005471434614775400177460ustar00rootroot00000000000000add_test( NAME cub.example.cmake.add_subdir COMMAND "${CMAKE_COMMAND}" --log-level=VERBOSE -G "${CMAKE_GENERATOR}" -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir" -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir" -D "CUB_ROOT=${CUB_SOURCE_DIR}" -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" ) 
cub-2.0.1/examples/cmake/add_subdir/000077500000000000000000000000001434614775400173005ustar00rootroot00000000000000cub-2.0.1/examples/cmake/add_subdir/CMakeLists.txt000066400000000000000000000015141434614775400220410ustar00rootroot00000000000000# This example demonstrates / tests adding CUB via a CMake add_subdirectory # call from a parent project. cmake_minimum_required(VERSION 3.15) # Silence warnings about empty CUDA_ARCHITECTURES properties on example targets: if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) cmake_policy(SET CMP0104 OLD) endif() project(CubAddSubDirExample CUDA) # Use your project's checkout of CUB here, for most cases # `add_subdirectory(cub)` will be sufficient. add_subdirectory("${CUB_ROOT}" cub) # Link the CUB::CUB target to your project's targets add_executable(HelloCUB dummy.cu) target_link_libraries(HelloCUB CUB::CUB) # # Validation # function(assert_target target_name) if (NOT TARGET "${target_name}") message(FATAL_ERROR "Target '${target_name}' not defined.") endif() endfunction() assert_target(CUB::CUB) assert_target(HelloCUB) cub-2.0.1/examples/cmake/add_subdir/dummy.cu000066400000000000000000000002011434614775400207550ustar00rootroot00000000000000#include #include int main() { std::cout << "Hello from CUB version " << CUB_VERSION << ":\n"; } cub-2.0.1/examples/device/000077500000000000000000000000001434614775400153575ustar00rootroot00000000000000cub-2.0.1/examples/device/.gitignore000066400000000000000000000001131434614775400173420ustar00rootroot00000000000000/bin /Debug /ipch /Release /cuda55.sdf /cuda55.suo /cuda60.sdf /cuda60.suo cub-2.0.1/examples/device/CMakeLists.txt000066400000000000000000000007311434614775400201200ustar00rootroot00000000000000file(GLOB_RECURSE example_srcs RELATIVE "${CMAKE_CURRENT_LIST_DIR}" CONFIGURE_DEPENDS example_*.cu ) foreach (cub_target IN LISTS CUB_TARGETS) foreach (example_src IN LISTS example_srcs) get_filename_component(example_name "${example_src}" NAME_WE) string(REGEX REPLACE "^example_device_" "device." example_name "${example_name}" ) cub_add_example(target_name ${example_name} "${example_src}" ${cub_target}) endforeach() endforeach() cub-2.0.1/examples/device/example_device_partition_flagged.cu000066400000000000000000000177551434614775400244430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DevicePartition::Flagged(). * * Partition flagged items from from a sequence of int keys using a * corresponding sequence of unsigned char flags. * * To compile using the command line: * nvcc -arch=sm_XX example_device_partition_flagged.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem, setting flags at distances of random length * chosen from [1..max_segment] */ void Initialize( int *h_in, unsigned char *h_flags, int num_items, int max_segment) { unsigned short max_short = (unsigned short) -1; int key = 0; int i = 0; while (i < num_items) { // Select number of repeating occurrences unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); repeat = CUB_MAX(1, repeat); int j = i; while (j < CUB_MIN(i + repeat, num_items)) { h_flags[j] = 0; h_in[j] = key; j++; } h_flags[i] = 1; i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("Flags:\n"); DisplayResults(h_flags, num_items); printf("\n\n"); } } /** * Solve unique problem */ int Solve( int *h_in, unsigned char *h_flags, int *h_reference, int num_items) { int num_selected = 0; for (int i = 0; i < num_items; ++i) { if (h_flags[i]) { h_reference[num_selected] = h_in[i]; num_selected++; } else { h_reference[num_items - (i - num_selected) - 1] = h_in[i]; } } return num_selected; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; int max_segment = 40; // Maximum segment length // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Allocate host arrays int *h_in = new int[num_items]; int *h_reference = new int[num_items]; unsigned char *h_flags = new 
unsigned char[num_items]; // Initialize problem and solution Initialize(h_in, h_flags, num_items, max_segment); int num_selected = Solve(h_in, h_flags, h_reference, num_items); printf("cub::DevicePartition::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n", num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int)); fflush(stdout); // Allocate problem device arrays int *d_in = NULL; unsigned char *d_flags = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags)); printf("\n\n"); return 0; } cub-2.0.1/examples/device/example_device_partition_if.cu000066400000000000000000000200541434614775400234320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple example of DevicePartition::If().
 *
 * Partitions items from a sequence of int keys using a
 * selection functor (greater-than).
 *
 * To compile using the command line:
 *   nvcc -arch=sm_XX example_device_partition_if.cu -I../.. -lcudart -O3
 *
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>

#include <cub/util_allocator.cuh>
#include <cub/device/device_partition.cuh>

#include "../../test/test_util.h"

using namespace cub;

//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose = false;  // Whether to display input/output to console
CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory

/// Selection functor type
struct GreaterThan
{
    int compare;

    __host__ __device__ __forceinline__
    GreaterThan(int compare) : compare(compare) {}

    __host__ __device__ __forceinline__
    bool operator()(const int &a) const
    {
        return (a > compare);
    }
};

//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------

/**
 * Initialize problem, setting runs of random length chosen from [1..max_segment]
 */
void Initialize(
    int     *h_in,
    int     num_items,
    int     max_segment)
{
    int key = 0;
    int i = 0;
    while (i < num_items)
    {
        // Randomly select number of repeating occurrences uniformly from [1..max_segment]
        unsigned short max_short = (unsigned short) -1;
        unsigned short repeat;
        RandomBits(repeat);
        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
        repeat = CUB_MAX(1, repeat);

        int j = i;
        while (j < CUB_MIN(i + repeat, num_items))
        {
            h_in[j] = key;
            j++;
        }

        i = j;
        key++;
    }

    if (g_verbose)
    {
        printf("Input:\n");
        DisplayResults(h_in, num_items);
        printf("\n\n");
    }
}

/**
 * Solve the partition problem on the host (reference solution)
 */
template <typename SelectOp>
int Solve(
    int         *h_in,
    SelectOp    select_op,
    int         *h_reference,
    int         num_items)
{
    int num_selected = 0;
    for (int i = 0; i < num_items; ++i)
    {
        if (select_op(h_in[i]))
        {
            h_reference[num_selected] = h_in[i];
            num_selected++;
        }
        else
        {
            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
        }
    }

    return num_selected;
}

//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 */
int main(int argc, char** argv)
{
    int num_items   = 150;
    int max_segment = 40;       // Maximum segment length

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
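/*
 * Note on the reference produced by Solve() above: items that satisfy
 * select_op are packed at the front of h_reference in their original order,
 * while rejected items are written from the back in reverse order. This
 * mirrors the two-partition output layout of cub::DevicePartition::If.
 * A small illustrative case (values chosen here for illustration only):
 *
 *   input:        1 5 2 7 3      predicate: x > 4
 *   partitioned:  5 7 | 3 2 1    (second partition in reverse input order)
 */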
args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Allocate host arrays int *h_in = new int[num_items]; int *h_reference = new int[num_items]; // DevicePartition a pivot index unsigned int pivot_index; unsigned int max_int = (unsigned int) -1; RandomBits(pivot_index); pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int)))); printf("Pivot idx: %d\n", pivot_index); fflush(stdout); // Initialize problem and solution Initialize(h_in, num_items, max_segment); GreaterThan select_op(h_in[pivot_index]); int num_selected = Solve(h_in, select_op, h_reference, num_items); printf("cub::DevicePartition::If %d items, %d selected (avg run length %d), %d-byte elements\n", num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int)); fflush(stdout); // Allocate problem device arrays int *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); return 0; } cub-2.0.1/examples/device/example_device_radix_sort.cu000066400000000000000000000200771434614775400231260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceRadixSort::SortPairs(). * * Sorts an array of float keys paired with a corresponding array of int values. * * To compile using the command line: * nvcc -arch=sm_XX example_device_radix_sort.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Simple key-value pairing for floating point types. Distinguishes * between positive and negative zero. */ struct Pair { float key; int value; bool operator<(const Pair &b) const { if (key < b.key) return true; if (key > b.key) return false; // Return true if key is negative zero and b.key is positive zero unsigned int key_bits = SafeBitCast(key); unsigned int b_key_bits = SafeBitCast(b.key); unsigned int HIGH_BIT = 1u << 31; return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0); } }; /** * Initialize key-value sorting problem. 
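 *
 * The reference solution below uses std::stable_sort with the Pair comparator
 * above. The comparator special-cases signed zero because DeviceRadixSort
 * orders float keys by their (bit-transformed) binary representation, so
 * -0.0f sorts before +0.0f even though the two compare equal under operator<.
 * A host-side sketch of the same sign test (using the bit-cast helper from
 * test_util.h; the explicit template argument shown here is assumed):
 *
 *   unsigned int bits = SafeBitCast<unsigned int>(-0.0f);
 *   bool negative_zero = (bits & (1u << 31)) != 0;   // true for -0.0f
 *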
*/ void Initialize( float *h_keys, int *h_values, float *h_reference_keys, int *h_reference_values, int num_items) { Pair *h_pairs = new Pair[num_items]; for (int i = 0; i < num_items; ++i) { RandomBits(h_keys[i]); RandomBits(h_values[i]); h_pairs[i].key = h_keys[i]; h_pairs[i].value = h_values[i]; } if (g_verbose) { printf("Input keys:\n"); DisplayResults(h_keys, num_items); printf("\n\n"); printf("Input values:\n"); DisplayResults(h_values, num_items); printf("\n\n"); } std::stable_sort(h_pairs, h_pairs + num_items); for (int i = 0; i < num_items; ++i) { h_reference_keys[i] = h_pairs[i].key; h_reference_values[i] = h_pairs[i].value; } delete[] h_pairs; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); printf("cub::DeviceRadixSort::SortPairs() %d items (%d-byte keys %d-byte values)\n", num_items, int(sizeof(float)), int(sizeof(int))); fflush(stdout); // Allocate host arrays float *h_keys = new float[num_items]; float *h_reference_keys = new float[num_items]; int *h_values = new int[num_items]; int *h_reference_values = new int[num_items]; // Initialize problem and solution on host Initialize(h_keys, h_values, h_reference_keys, h_reference_values, num_items); // Allocate device arrays DoubleBuffer d_keys; DoubleBuffer d_values; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(float) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(float) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(int) * num_items)); // Allocate temporary storage size_t temp_storage_bytes = 0; void *d_temp_storage = NULL; CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Initialize device arrays CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Run CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose); printf("\t Compare keys (selector %d): %s\n", d_keys.selector, compare ? "FAIL" : "PASS"); AssertEquals(0, compare); compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose); printf("\t Compare values (selector %d): %s\n", d_values.selector, compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_keys) delete[] h_keys; if (h_reference_keys) delete[] h_reference_keys; if (h_values) delete[] h_values; if (h_reference_values) delete[] h_reference_values; if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0])); if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1])); if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0])); if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1])); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); return 0; } cub-2.0.1/examples/device/example_device_reduce.cu000066400000000000000000000133031434614775400222110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceReduce::Sum(). * * Sums an array of int keys. * * To compile using the command line: * nvcc -arch=sm_XX example_device_reduce.cu -I../.. 
-lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ void Initialize( int *h_in, int num_items) { for (int i = 0; i < num_items; ++i) h_in[i] = i; if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Compute solution */ void Solve( int *h_in, int &h_reference, int num_items) { for (int i = 0; i < num_items; ++i) { if (i == 0) h_reference = h_in[0]; else h_reference += h_in[i]; } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); printf("cub::DeviceReduce::Sum() %d items (%d-byte elements)\n", num_items, (int) sizeof(int)); fflush(stdout); // Allocate host arrays int* h_in = new int[num_items]; int h_reference{}; // Initialize problem and solution Initialize(h_in, num_items); Solve(h_in, h_reference, num_items); // Allocate problem device arrays int *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array int *d_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1)); // Request and allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose); printf("\t%s", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) delete[] h_in; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); return 0; } cub-2.0.1/examples/device/example_device_scan.cu000066400000000000000000000135671434614775400217020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceScan::ExclusiveSum(). * * Computes an exclusive sum of int keys. * * To compile using the command line: * nvcc -arch=sm_XX example_device_scan.cu -I../.. 
-lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ void Initialize( int *h_in, int num_items) { for (int i = 0; i < num_items; ++i) h_in[i] = i; if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve exclusive-scan problem */ int Solve( int *h_in, int *h_reference, int num_items) { int inclusive = 0; int aggregate = 0; for (int i = 0; i < num_items; ++i) { h_reference[i] = inclusive; inclusive += h_in[i]; aggregate += h_in[i]; } return aggregate; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); printf("cub::DeviceScan::ExclusiveSum %d items (%d-byte elements)\n", num_items, (int) sizeof(int)); fflush(stdout); // Allocate host arrays int* h_in = new int[num_items]; int* h_reference = new int[num_items]; // Initialize problem and solution Initialize(h_in, num_items); Solve(h_in, h_reference, num_items); // Allocate problem device arrays int *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array int *d_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); printf("\t%s", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); return 0; } cub-2.0.1/examples/device/example_device_select_flagged.cu000066400000000000000000000177341434614775400237060ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceSelect::Flagged(). * * Selects flagged items from from a sequence of int keys using a * corresponding sequence of unsigned char flags. * * To compile using the command line: * nvcc -arch=sm_XX example_device_select_flagged.cu -I../.. 
-lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem, setting flags at distances of random length * chosen from [1..max_segment] */ void Initialize( int *h_in, unsigned char *h_flags, int num_items, int max_segment) { unsigned short max_short = (unsigned short) -1; int key = 0; int i = 0; while (i < num_items) { // Select number of repeating occurrences unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); repeat = CUB_MAX(1, repeat); int j = i; while (j < CUB_MIN(i + repeat, num_items)) { h_flags[j] = 0; h_in[j] = key; j++; } h_flags[i] = 1; i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("Flags:\n"); DisplayResults(h_flags, num_items); printf("\n\n"); } } /** * Solve unique problem */ int Solve( int *h_in, unsigned char *h_flags, int *h_reference, int num_items) { int num_selected = 0; for (int i = 0; i < num_items; ++i) { if (h_flags[i]) { h_reference[num_selected] = h_in[i]; num_selected++; } else { h_reference[num_items - (i - num_selected) - 1] = h_in[i]; } } return num_selected; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; int max_segment = 40; // Maximum segment length // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Allocate host arrays int *h_in = new int[num_items]; int *h_reference = new int[num_items]; unsigned char *h_flags = new unsigned char[num_items]; // Initialize problem and solution Initialize(h_in, h_flags, num_items, max_segment); int num_selected = Solve(h_in, h_flags, h_reference, num_items); printf("cub::DeviceSelect::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n", num_items, num_selected, (num_selected > 0) ? 
num_items / num_selected : 0, (int) sizeof(int)); fflush(stdout); // Allocate problem device arrays int *d_in = NULL; unsigned char *d_flags = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags)); printf("\n\n"); return 0; } cub-2.0.1/examples/device/example_device_select_if.cu000066400000000000000000000200221434614775400226730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceSelect::If(). * * Selects items from from a sequence of int keys using a * section functor (greater-than) * * To compile using the command line: * nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory /// Selection functor type struct GreaterThan { int compare; __host__ __device__ __forceinline__ GreaterThan(int compare) : compare(compare) {} __host__ __device__ __forceinline__ bool operator()(const int &a) const { return (a > compare); } }; //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem, setting runs of random length chosen from [1..max_segment] */ void Initialize( int *h_in, int num_items, int max_segment) { int key = 0; int i = 0; while (i < num_items) { // Randomly select number of repeating occurrences uniformly from [1..max_segment] unsigned short max_short = (unsigned short) -1; unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); repeat = CUB_MAX(1, repeat); int j = i; while (j < CUB_MIN(i + repeat, num_items)) { h_in[j] = key; j++; } i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve unique problem */ template int Solve( int *h_in, SelectOp select_op, int *h_reference, int num_items) { int num_selected = 0; for (int i = 0; i < num_items; ++i) { if (select_op(h_in[i])) { h_reference[num_selected] = h_in[i]; num_selected++; } else { h_reference[num_items - (i - num_selected) - 1] = h_in[i]; } } return num_selected; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; int max_segment = 40; // Maximum segment length // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device 
CubDebugExit(args.DeviceInit()); // Allocate host arrays int *h_in = new int[num_items]; int *h_reference = new int[num_items]; // Select a pivot index unsigned int pivot_index; unsigned int max_int = (unsigned int) -1; RandomBits(pivot_index); pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int)))); printf("Pivot idx: %d\n", pivot_index); fflush(stdout); // Initialize problem and solution Initialize(h_in, num_items, max_segment); GreaterThan select_op(h_in[pivot_index]); int num_selected = Solve(h_in, select_op, h_reference, num_items); printf("cub::DeviceSelect::If %d items, %d selected (avg run length %d), %d-byte elements\n", num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int)); fflush(stdout); // Allocate problem device arrays int *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); return 0; } cub-2.0.1/examples/device/example_device_select_unique.cu000066400000000000000000000165051434614775400236160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceSelect::Unique(). * * Selects the first element from each run of identical values from a sequence * of int keys. * * To compile using the command line: * nvcc -arch=sm_XX example_device_select_unique.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem, setting runs of random length chosen from [1..max_segment] */ void Initialize( int *h_in, int num_items, int max_segment) { int key = 0; int i = 0; while (i < num_items) { // Randomly select number of repeating occurrences uniformly from [1..max_segment] unsigned short max_short = (unsigned short) -1; unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); repeat = CUB_MAX(1, repeat); int j = i; while (j < CUB_MIN(i + repeat, num_items)) { h_in[j] = key; j++; } i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve unique problem */ int Solve( int *h_in, int *h_reference, int num_items) { int num_selected = 0; if (num_items > 0) { h_reference[num_selected] = h_in[0]; num_selected++; } for (int i = 1; i < num_items; ++i) { if (h_in[i] != h_in[i - 1]) { h_reference[num_selected] = h_in[i]; num_selected++; } } return num_selected; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; int max_segment = 40; // Maximum segment length // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device 
CubDebugExit(args.DeviceInit()); // Allocate host arrays int* h_in = new int[num_items]; int* h_reference = new int[num_items]; // Initialize problem and solution Initialize(h_in, num_items, max_segment); int num_selected = Solve(h_in, h_reference, num_items); printf("cub::DeviceSelect::Unique %d items (%d-byte elements), %d selected (avg run length %d)\n", num_items, (int) sizeof(int), num_selected, num_items / num_selected); fflush(stdout); // Allocate problem device arrays int *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); return 0; } cub-2.0.1/examples/device/example_device_sort_find_non_trivial_runs.cu000066400000000000000000000317351434614775400264150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of sorting a sequence of keys and values (each pair is a * randomly-selected int32 paired with its original offset in the unsorted sequence), and then * isolating all maximal, non-trivial (having length > 1) "runs" of duplicates. * * To compile using the command line: * nvcc -arch=sm_XX example_device_sort_find_non_trivial_runs.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Simple key-value pairing for using std::sort on key-value pairs. */ template struct Pair { Key key; Value value; bool operator<(const Pair &b) const { return (key < b.key); } }; /** * Pair ostream operator */ template std::ostream& operator<<(std::ostream& os, const Pair& val) { os << '<' << val.key << ',' << val.value << '>'; return os; } /** * Initialize problem */ template void Initialize( Key *h_keys, Value *h_values, int num_items, int max_key) { float scale = float(max_key) / float(UINT_MAX); for (int i = 0; i < num_items; ++i) { Key sample; RandomBits(sample); h_keys[i] = (max_key == -1) ? i : (Key) (scale * sample); h_values[i] = i; } if (g_verbose) { printf("Keys:\n"); DisplayResults(h_keys, num_items); printf("\n\n"); printf("Values:\n"); DisplayResults(h_values, num_items); printf("\n\n"); } } /** * Solve sorted non-trivial subrange problem. Returns the number * of non-trivial runs found. 
*/ template int Solve( Key *h_keys, Value *h_values, int num_items, int *h_offsets_reference, int *h_lengths_reference) { // Sort Pair *h_pairs = new Pair[num_items]; for (int i = 0; i < num_items; ++i) { h_pairs[i].key = h_keys[i]; h_pairs[i].value = h_values[i]; } std::stable_sort(h_pairs, h_pairs + num_items); if (g_verbose) { printf("Sorted pairs:\n"); DisplayResults(h_pairs, num_items); printf("\n\n"); } // Find non-trivial runs Key previous = h_pairs[0].key; int length = 1; int num_runs = 0; int run_begin = 0; for (int i = 1; i < num_items; ++i) { if (previous != h_pairs[i].key) { if (length > 1) { h_offsets_reference[num_runs] = run_begin; h_lengths_reference[num_runs] = length; num_runs++; } length = 1; run_begin = i; } else { length++; } previous = h_pairs[i].key; } if (length > 1) { h_offsets_reference[num_runs] = run_begin; h_lengths_reference[num_runs] = length; num_runs++; } delete[] h_pairs; return num_runs; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { typedef unsigned int Key; typedef int Value; int timing_iterations = 0; int num_items = 40; Key max_key = 20; // Max item // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxkey", max_key); args.GetCmdLineArgument("i", timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--i= " "[--n= " "[--maxkey=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Allocate host arrays (problem and reference solution) Key *h_keys = new Key[num_items]; Value *h_values = new Value[num_items]; int *h_offsets_reference = new int[num_items]; int *h_lengths_reference = new int[num_items]; // Initialize key-value pairs and compute reference solution (sort them, and identify non-trivial runs) printf("Computing reference solution on CPU for %d items (max key %d)\n", num_items, max_key); fflush(stdout); Initialize(h_keys, h_values, num_items, max_key); int num_runs = Solve(h_keys, h_values, num_items, h_offsets_reference, h_lengths_reference); printf("%d non-trivial runs\n", num_runs); fflush(stdout); // Repeat for performance timing GpuTimer gpu_timer; GpuTimer gpu_rle_timer; float elapsed_millis = 0.0; float elapsed_rle_millis = 0.0; for (int i = 0; i <= timing_iterations; ++i) { // Allocate and initialize device arrays for sorting DoubleBuffer d_keys; DoubleBuffer d_values; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items)); CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Start timer gpu_timer.Start(); // Allocate temporary storage for sorting size_t temp_storage_bytes = 0; void *d_temp_storage = NULL; CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); 
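// (Note: with d_temp_storage == NULL, the SortPairs call above only computes the
// required temp_storage_bytes; the actual sort runs in the second call below.)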
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Do the sort CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); // Free unused buffers and sorting temporary storage if (d_keys.d_buffers[d_keys.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1])); if (d_values.d_buffers[d_values.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1])); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Start timer gpu_rle_timer.Start(); // Allocate device arrays for enumerating non-trivial runs int *d_offests_out = NULL; int *d_lengths_out = NULL; int *d_num_runs = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offests_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1)); // Allocate temporary storage for isolating non-trivial runs d_temp_storage = NULL; CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns( d_temp_storage, temp_storage_bytes, d_keys.d_buffers[d_keys.selector], d_offests_out, d_lengths_out, d_num_runs, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Do the isolation CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns( d_temp_storage, temp_storage_bytes, d_keys.d_buffers[d_keys.selector], d_offests_out, d_lengths_out, d_num_runs, num_items)); // Free keys buffer if (d_keys.d_buffers[d_keys.selector]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector])); // // Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys // // Stop sort timer gpu_timer.Stop(); gpu_rle_timer.Stop(); if (i == 0) { // First iteration is a warmup: // Check for correctness (and display results, if specified) printf("\nRUN OFFSETS: \n"); int compare = CompareDeviceResults(h_offsets_reference, d_offests_out, num_runs, true, g_verbose); printf("\t\t %s ", compare ? "FAIL" : "PASS"); printf("\nRUN LENGTHS: \n"); compare |= CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose); printf("\t\t %s ", compare ? "FAIL" : "PASS"); printf("\nNUM RUNS: \n"); compare |= CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose); printf("\t\t %s ", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); } else { elapsed_millis += gpu_timer.ElapsedMillis(); elapsed_rle_millis += gpu_rle_timer.ElapsedMillis(); } // GPU cleanup if (d_values.d_buffers[d_values.selector]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector])); if (d_offests_out) CubDebugExit(g_allocator.DeviceFree(d_offests_out)); if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out)); if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } // Host cleanup if (h_keys) delete[] h_keys; if (h_values) delete[] h_values; if (h_offsets_reference) delete[] h_offsets_reference; if (h_lengths_reference) delete[] h_lengths_reference; printf("\n\n"); if (timing_iterations > 0) { printf("%d timing iterations, average time to sort and isolate non-trivial duplicates: %.3f ms (%.3f ms spent in RLE isolation)\n", timing_iterations, elapsed_millis / timing_iterations, elapsed_rle_millis / timing_iterations); } return 0; } cub-2.0.1/test/000077500000000000000000000000001434614775400132615ustar00rootroot00000000000000cub-2.0.1/test/.gitignore000066400000000000000000000000341434614775400152460ustar00rootroot00000000000000/bin /link_main.obj /dummy/ cub-2.0.1/test/CMakeLists.txt000066400000000000000000000201041434614775400160160ustar00rootroot00000000000000# Some tests always build with RDC, so make sure that the sm_XX flags are # compatible. See note in CubCudaConfig.cmake. # TODO once we're using CUDA_ARCHITECTURES, we can setup non-rdc fallback # tests to build for non-rdc arches. But for now, all files in a given directory # must build with the same `CMAKE_CUDA_FLAGS` due to CMake constraints around # how CUDA_FLAGS works. set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_RDC}") # The function below reads the filepath `src`, extracts the %PARAM% comments, # and fills `labels_var` with a list of `label1_value1.label2_value2...` # strings, and puts the corresponding `DEFINITION=value1:DEFINITION=value2` # entries into `defs_var`. # # See the README.md file in this directory for background info. 
function(cub_get_test_params src labels_var defs_var) file(READ "${src}" file_data) set(param_regex "//[ ]+%PARAM%[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^\n]*)") string(REGEX MATCHALL "${param_regex}" matches "${file_data}" ) set(variant_labels) set(variant_defs) foreach(match IN LISTS matches) string(REGEX MATCH "${param_regex}" unused "${match}" ) set(def ${CMAKE_MATCH_1}) set(label ${CMAKE_MATCH_2}) set(values "${CMAKE_MATCH_3}") string(REPLACE ":" ";" values "${values}") # Build lists of test name suffixes (labels) and preprocessor definitions # (defs) containing the cartesian product of all param values: if (NOT variant_labels) foreach(value IN LISTS values) list(APPEND variant_labels ${label}_${value}) endforeach() else() set(tmp_labels) foreach(old_label IN LISTS variant_labels) foreach(value IN LISTS values) list(APPEND tmp_labels ${old_label}.${label}_${value}) endforeach() endforeach() set(variant_labels "${tmp_labels}") endif() if (NOT variant_defs) foreach(value IN LISTS values) list(APPEND variant_defs ${def}=${value}) endforeach() else() set(tmp_defs) foreach(old_def IN LISTS variant_defs) foreach(value IN LISTS values) list(APPEND tmp_defs ${old_def}:${def}=${value}) endforeach() endforeach() set(variant_defs "${tmp_defs}") endif() endforeach() set(${labels_var} "${variant_labels}" PARENT_SCOPE) set(${defs_var} "${variant_defs}" PARENT_SCOPE) endfunction() # Create meta targets that build all tests for a single configuration: foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) set(config_meta_target ${config_prefix}.tests) add_custom_target(${config_meta_target}) add_dependencies(${config_prefix}.all ${config_meta_target}) endforeach() file(GLOB test_srcs RELATIVE ${CUB_SOURCE_DIR}/test CONFIGURE_DEPENDS test_*.cu ) ## cub_add_test # # Add a test executable and register it with ctest. # # target_name_var: Variable name to overwrite with the name of the test # target. Useful for post-processing target information. # test_name: The name of the test minus ".test." For example, # testing/vector.cu will be "vector", and testing/cuda/copy.cu will be # "cuda.copy". # test_src: The source file that implements the test. # cub_target: The reference cub target with configuration information. # function(cub_add_test target_name_var test_name test_src cub_target) cub_get_target_property(config_prefix ${cub_target} PREFIX) # The actual name of the test's target: set(test_target ${config_prefix}.test.${test_name}) set(${target_name_var} ${test_target} PARENT_SCOPE) # Related target names: set(config_meta_target ${config_prefix}.tests) set(test_meta_target cub.all.test.${test_name}) add_executable(${test_target} "${test_src}") target_link_libraries(${test_target} ${cub_target}) cub_clone_target_properties(${test_target} ${cub_target}) target_include_directories(${test_target} PRIVATE "${CUB_SOURCE_DIR}/test") target_compile_definitions(${test_target} PRIVATE CUB_DEBUG_HOST_ASSERTIONS) # Add to the active configuration's meta target add_dependencies(${config_meta_target} ${test_target}) # Meta target that builds tests with this name for all configurations: if (NOT TARGET ${test_meta_target}) add_custom_target(${test_meta_target}) endif() add_dependencies(${test_meta_target} ${test_target}) add_test(NAME ${test_target} COMMAND "$" ) endfunction() # Sets out_var to 1 if the label contains cdp variants, regardless of whether # or not CDP is enabled in this particular variant. 
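# For example, labels "foo_0.cdp_0" and "foo_0.cdp_1" both set out_var to 1,
# while "foo_0" sets it to 0.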
function(_cub_has_cdp_variant out_var label) string(FIND "${label}" "cdp_" idx) if (idx EQUAL -1) set(${out_var} 0 PARENT_SCOPE) else() set(${out_var} 1 PARENT_SCOPE) endif() endfunction() # Sets out_var to 1 if the label contains "cdp_1", e.g. cdp is explicitly # requested for this variant. function(_cub_is_cdp_enabled_variant out_var label) string(FIND "${label}" "cdp_1" idx) if (idx EQUAL -1) set(${out_var} 0 PARENT_SCOPE) else() set(${out_var} 1 PARENT_SCOPE) endif() endfunction() foreach (test_src IN LISTS test_srcs) get_filename_component(test_name "${test_src}" NAME_WE) string(REGEX REPLACE "^test_" "" test_name "${test_name}") cub_get_test_params("${test_src}" variant_labels variant_defs) list(LENGTH variant_labels num_variants) # Subtract 1 to support the inclusive endpoint of foreach(...RANGE...): math(EXPR range_end "${num_variants} - 1") # Verbose output: if (num_variants GREATER 0) message(VERBOSE "Detected ${num_variants} variants of test '${test_src}':") foreach(var_idx RANGE ${range_end}) math(EXPR i "${var_idx} + 1") list(GET variant_labels ${var_idx} label) list(GET variant_defs ${var_idx} defs) message(VERBOSE " ${i}: ${test_name} ${label} ${defs}") endforeach() endif() foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) if (num_variants EQUAL 0) # Only one version of this test. cub_add_test(test_target ${test_name} "${test_src}" ${cub_target}) if (CUB_ENABLE_TESTS_WITH_RDC) cub_enable_rdc_for_cuda_target(${test_target}) endif() else() # has variants: # Meta target to build all parametrizations of the current test for the # current CUB_TARGET config set(variant_meta_target ${config_prefix}.test.${test_name}.all) if (NOT TARGET ${variant_meta_target}) add_custom_target(${variant_meta_target}) endif() # Meta target to build all parametrizations of the current test for all # CUB_TARGET configs set(cub_variant_meta_target cub.all.test.${test_name}.all) if (NOT TARGET ${cub_variant_meta_target}) add_custom_target(${cub_variant_meta_target}) endif() # Generate multiple tests, one per variant. # See `cub_get_test_params` for details. foreach(var_idx RANGE ${range_end}) list(GET variant_labels ${var_idx} label) list(GET variant_defs ${var_idx} defs) string(REPLACE ":" ";" defs "${defs}") # Check if the test has explicit CDP variants: _cub_has_cdp_variant(explicit_cdp "${label}") _cub_is_cdp_enabled_variant(enable_cdp "${label}") cub_add_test(test_target ${test_name}.${label} "${test_src}" ${cub_target} ) add_dependencies(${variant_meta_target} ${test_target}) add_dependencies(${cub_variant_meta_target} ${test_target}) target_compile_definitions(${test_target} PRIVATE ${defs}) # Enable RDC if the test either: # 1. Explicitly requests it (cdp_1 label) # 2. Does not have an explicit CDP variant (no cdp_0 or cdp_1) but # RDC testing is globally enabled. # # Tests that explicitly request no cdp (cdp_0 label) should never enable # RDC. if (enable_cdp OR ((NOT explicit_cdp) AND CUB_ENABLE_TESTS_WITH_RDC)) cub_enable_rdc_for_cuda_target(${test_target}) endif() endforeach() # Variant endif() # Has variants endforeach() # CUB targets endforeach() # Source file add_subdirectory(cmake) cub-2.0.1/test/README.md000066400000000000000000000110241434614775400145360ustar00rootroot00000000000000# Test Parametrization Some of CUB's tests are very slow to build and are capable of exhausting RAM during compilation/linking. 
To avoid such issues, large tests are split into multiple executables to take advantage of parallel computation and reduce memory usage. CUB facilitates this by checking for special `%PARAM%` comments in each test's source code, and then uses this information to generate multiple executables with different configurations. ## Using `%PARAM%` The `%PARAM%` hint provides an automated method of generating multiple test executables from a single source file. To use it, add one or more special comments to the test source file: ```cpp // %PARAM% [definition] [label] [values] ``` CMake will parse the source file and extract these comments, using them to generate multiple test executables for the full cartesian product of values. - `definition` will be used as a preprocessor definition name. By convention, these begin with `TEST_`. - `label` is a short, human-readable label that will be used in the test executable's name to identify the test variant. - `values` is a colon-separated list of values used during test generation. Only numeric values have been tested. ## Special Labels ### CDP / RDC Testing If a `label` is `cdp`, it is assumed that the parameter is used to explicitly test variants built with and without CDP support. The `values` for such a parameter must be `0:1`, with `0` indicating CDP disabled (RDC off) and `1` indicating CDP enabled (RDC on). Tests that do not contain a variant labeled `cdp` will only enable RDC if the CMake variable `CUB_ENABLE_TESTS_WITH_RDC` is true. ## Example For example, if `test_baz.cu` contains the following lines: ```cpp // %PARAM% TEST_FOO foo 0:1:2 // %PARAM% TEST_CDP cdp 0:1 ``` Six executables and CTest targets will be generated with unique definitions (only c++17 targets shown): | Executable Name | Preprocessor Definitions | RDC State | |----------------------------------|-----------------------------|-----------| | `cub.cpp17.test.baz.foo_0.cdp_0` | `-DTEST_FOO=0 -DTEST_CDP=0` | Disabled | | `cub.cpp17.test.baz.foo_0.cdp_1` | `-DTEST_FOO=0 -DTEST_CDP=1` | Enabled | | `cub.cpp17.test.baz.foo_1.cdp_0` | `-DTEST_FOO=1 -DTEST_CDP=0` | Disabled | | `cub.cpp17.test.baz.foo_1.cdp_1` | `-DTEST_FOO=1 -DTEST_CDP=1` | Enabled | | `cub.cpp17.test.baz.foo_2.cdp_0` | `-DTEST_FOO=2 -DTEST_CDP=0` | Disabled | | `cub.cpp17.test.baz.foo_2.cdp_1` | `-DTEST_FOO=2 -DTEST_CDP=1` | Enabled | ## Changing `%PARAM%` Hints Since CMake does not automatically reconfigure the build when source files are modified, CMake will need to be rerun manually whenever the `%PARAM%` comments change. ## Building and Running Split Tests CMake will generate individual build and test targets for each test variant, and also provides build "metatargets" that compile all variants of a given test. The variants follow the usual naming convention for CUB's tests, but include a suffix that differentiates them (e.g. `.foo_X.bar_Y` in the example above). 
### Individual Test Variants Continuing with the `test_baz.cu` example, the test variant that uses `-DTEST_FOO=1 -DTEST_BAR=4` can be built and run alone: ```bash # Build a single variant: make cub.cpp17.test.baz.foo_1.bar_4 # Run a single variant bin/cub.cpp17.test.baz.foo_1.bar_4 # Run a single variant using CTest regex: ctest -R cub\.cpp17\.test\.baz\.foo_1\.bar_4 ``` ### All Variants of a Test Using a metatarget and the proper regex, all variants of a test can be built and executed without listing all variants explicitly: ```bash # Build all variants using the `.all` metatarget make cub.cpp17.test.baz.all # Run all variants: ctest -R cub\.cpp17\.test\.baz\. ``` ## Debugging Running CMake with `--log-level=VERBOSE` will print out extra information about all detected test variants. ## Additional Info Ideally, only parameters that directly influence kernel template instantiations should be split out in this way. If changing a parameter doesn't change the kernel template type, the same kernel will be compiled into multiple executables. This defeats the purpose of splitting up the test since the compiler will generate redundant code across the new split executables. The best candidate parameters for splitting are input value types, rather than integral parameters like BLOCK_THREADS, etc. Splitting by value type allows more infrastructure (data generation, validation) to be reused. Splitting other parameters can cause build times to increase since type-related infrastructure has to be rebuilt for each test variant. cub-2.0.1/test/bfloat16.h000066400000000000000000000162001434614775400150470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once /** * \file * Utilities for interacting with the opaque CUDA __nv_bfloat16 type */ #include #include #include #include #ifdef __GNUC__ // There's a ton of type-punning going on in this file. 
#pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif /****************************************************************************** * bfloat16_t ******************************************************************************/ /** * Host-based fp16 data type compatible and convertible with __nv_bfloat16 */ struct bfloat16_t { uint16_t __x; /// Constructor from __nv_bfloat16 __host__ __device__ __forceinline__ bfloat16_t(const __nv_bfloat16 &other) { __x = reinterpret_cast(other); } /// Constructor from integer __host__ __device__ __forceinline__ bfloat16_t(int a) { *this = bfloat16_t(float(a)); } /// Constructor from std::size_t __host__ __device__ __forceinline__ bfloat16_t(std::size_t a) { *this = bfloat16_t(float(a)); } /// Default constructor bfloat16_t() = default; /// Constructor from float __host__ __device__ __forceinline__ bfloat16_t(float a) { // Refrence: // https://github.com/pytorch/pytorch/blob/44cc873fba5e5ffc4d4d4eef3bd370b653ce1ce1/c10/util/BFloat16.h#L51 uint16_t ir; if (a != a) { ir = UINT16_C(0x7FFF); } else { union { uint32_t U32; float F32; }; F32 = a; uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); ir = static_cast((U32 + rounding_bias) >> 16); } this->__x = ir; } /// Cast to __nv_bfloat16 __host__ __device__ __forceinline__ operator __nv_bfloat16() const { return reinterpret_cast(__x); } /// Cast to float __host__ __device__ __forceinline__ operator float() const { float f = 0; uint32_t *p = reinterpret_cast(&f); *p = uint32_t(__x) << 16; return f; } /// Get raw storage __host__ __device__ __forceinline__ uint16_t raw() const { return this->__x; } /// Equality __host__ __device__ __forceinline__ bool operator ==(const bfloat16_t &other) const { return (this->__x == other.__x); } /// Inequality __host__ __device__ __forceinline__ bool operator !=(const bfloat16_t &other) const { return (this->__x != other.__x); } /// Assignment by sum __host__ __device__ __forceinline__ bfloat16_t& operator +=(const bfloat16_t &rhs) { *this = bfloat16_t(float(*this) + float(rhs)); return *this; } /// Multiply __host__ __device__ __forceinline__ bfloat16_t operator*(const bfloat16_t &other) { return bfloat16_t(float(*this) * float(other)); } /// Add __host__ __device__ __forceinline__ bfloat16_t operator+(const bfloat16_t &other) { return bfloat16_t(float(*this) + float(other)); } /// Less-than __host__ __device__ __forceinline__ bool operator<(const bfloat16_t &other) const { return float(*this) < float(other); } /// Less-than-equal __host__ __device__ __forceinline__ bool operator<=(const bfloat16_t &other) const { return float(*this) <= float(other); } /// Greater-than __host__ __device__ __forceinline__ bool operator>(const bfloat16_t &other) const { return float(*this) > float(other); } /// Greater-than-equal __host__ __device__ __forceinline__ bool operator>=(const bfloat16_t &other) const { return float(*this) >= float(other); } /// numeric_traits::max __host__ __device__ __forceinline__ static bfloat16_t (max)() { uint16_t max_word = 0x7F7F; return reinterpret_cast(max_word); } /// numeric_traits::lowest __host__ __device__ __forceinline__ static bfloat16_t lowest() { uint16_t lowest_word = 0xFF7F; return reinterpret_cast(lowest_word); } }; /****************************************************************************** * I/O stream overloads ******************************************************************************/ /// Insert formatted \p bfloat16_t into the output stream std::ostream& operator<<(std::ostream &out, const 
bfloat16_t &x) { out << (float)x; return out; } /// Insert formatted \p __nv_bfloat16 into the output stream std::ostream& operator<<(std::ostream &out, const __nv_bfloat16 &x) { return out << bfloat16_t(x); } /****************************************************************************** * Traits overloads ******************************************************************************/ template <> struct CUB_NS_QUALIFIER::FpLimits { static __host__ __device__ __forceinline__ bfloat16_t Max() { return bfloat16_t::max(); } static __host__ __device__ __forceinline__ bfloat16_t Lowest() { return bfloat16_t::lowest(); } }; template <> struct CUB_NS_QUALIFIER::NumericTraits : CUB_NS_QUALIFIER:: BaseTraits {}; #ifdef __GNUC__ #pragma GCC diagnostic pop #endif cub-2.0.1/test/cmake/000077500000000000000000000000001434614775400143415ustar00rootroot00000000000000cub-2.0.1/test/cmake/CMakeLists.txt000066400000000000000000000015031434614775400171000ustar00rootroot00000000000000if (NOT CUB_IN_THRUST) # Thrust has its own checks for this: # Test that we can use `find_package` on an installed CUB: add_test( NAME cub.test.cmake.test_install COMMAND "${CMAKE_COMMAND}" --log-level=VERBOSE -G "${CMAKE_GENERATOR}" -S "${CMAKE_CURRENT_SOURCE_DIR}/test_install" -B "${CMAKE_CURRENT_BINARY_DIR}/test_install" -D "CUB_BINARY_DIR=${CUB_BINARY_DIR}" -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" ) endif() # Check source code for issues that can be found by pattern matching: add_test( NAME cub.test.cmake.check_source_files COMMAND "${CMAKE_COMMAND}" -D "CUB_SOURCE_DIR=${CUB_SOURCE_DIR}" -P "${CMAKE_CURRENT_LIST_DIR}/check_source_files.cmake" ) cub-2.0.1/test/cmake/check_source_files.cmake000066400000000000000000000136531434614775400211720ustar00rootroot00000000000000# Check all source files for various issues that can be detected using pattern # matching. # # This is run as a ctest test named `cub.test.cmake.check_namespace`, or # manually with: # cmake -D "CUB_SOURCE_DIR=" -P check_namespace.cmake cmake_minimum_required(VERSION 3.15) function(count_substrings input search_regex output_var) string(REGEX MATCHALL "${search_regex}" matches "${input}") list(LENGTH matches num_matches) set(${output_var} ${num_matches} PARENT_SCOPE) endfunction() set(found_errors 0) file(GLOB_RECURSE cub_srcs RELATIVE "${CUB_SOURCE_DIR}" "${CUB_SOURCE_DIR}/cub/*.cuh" "${CUB_SOURCE_DIR}/cub/*.cu" "${CUB_SOURCE_DIR}/cub/*.h" "${CUB_SOURCE_DIR}/cub/*.cpp" ) ################################################################################ # Namespace checks. # Check all files in thrust to make sure that they use # CUB_NAMESPACE_BEGIN/END instead of bare `namespace cub {}` declarations. set(namespace_exclusions # This defines the macros and must have bare namespace declarations: cub/util_namespace.cuh ) set(bare_ns_regex "namespace[ \n\r\t]+cub[ \n\r\t]*\\{") # Validation check for the above regex: count_substrings([=[ namespace cub{ namespace cub { namespace cub { namespace cub { namespace cub { namespace cub { ]=] ${bare_ns_regex} valid_count) if (NOT valid_count EQUAL 6) message(FATAL_ERROR "Validation of bare namespace regex failed: " "Matched ${valid_count} times, expected 6.") endif() ################################################################################ # stdpar header checks. # Check all files in CUB to make sure that they aren't including # or , both of which will cause circular dependencies in nvc++'s # stdpar library. 
# # The headers following headers should be used instead: # -> # -> # set(stdpar_header_exclusions # Placeholder -- none yet. ) set(algorithm_regex "#[ \t]*include[ \t]+") set(memory_regex "#[ \t]*include[ \t]+") set(numeric_regex "#[ \t]*include[ \t]+") # Validation check for the above regex pattern: count_substrings([=[ #include # include #include # include # include // ... ]=] ${algorithm_regex} valid_count) if (NOT valid_count EQUAL 5) message(FATAL_ERROR "Validation of stdpar header regex failed: " "Matched ${valid_count} times, expected 5.") endif() ################################################################################ # Legacy macro checks. # Check all files in CUB to make sure that they aren't using the legacy # CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros. # # These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET. # They are provided for legacy purposes and should be replaced with # [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code. # # set(legacy_macro_header_exclusions # This header defines a legacy CUDART macro: cub/detail/detect_cuda_runtime.cuh ) set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED") set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__") ################################################################################ # Read source files: foreach(src ${cub_srcs}) file(READ "${CUB_SOURCE_DIR}/${src}" src_contents) if (NOT ${src} IN_LIST namespace_exclusions) count_substrings("${src_contents}" "${bare_ns_regex}" bare_ns_count) count_substrings("${src_contents}" CUB_NS_PREFIX prefix_count) count_substrings("${src_contents}" CUB_NS_POSTFIX postfix_count) count_substrings("${src_contents}" CUB_NAMESPACE_BEGIN begin_count) count_substrings("${src_contents}" CUB_NAMESPACE_END end_count) if (NOT bare_ns_count EQUAL 0) message("'${src}' contains 'namespace cub {...}'. Replace with CUB_NAMESPACE macros.") set(found_errors 1) endif() if (NOT prefix_count EQUAL 0) message("'${src}' contains 'CUB_NS_PREFIX'. Replace with CUB_NAMESPACE macros.") set(found_errors 1) endif() if (NOT postfix_count EQUAL 0) message("'${src}' contains 'CUB_NS_POSTFIX'. Replace with CUB_NAMESPACE macros.") set(found_errors 1) endif() if (NOT begin_count EQUAL end_count) message("'${src}' namespace macros are unbalanced:") message(" - CUB_NAMESPACE_BEGIN occurs ${begin_count} times.") message(" - CUB_NAMESPACE_END occurs ${end_count} times.") set(found_errors 1) endif() endif() if (NOT ${src} IN_LIST stdpar_header_exclusions) count_substrings("${src_contents}" "${algorithm_regex}" algorithm_count) count_substrings("${src_contents}" "${memory_regex}" memory_count) count_substrings("${src_contents}" "${numeric_regex}" numeric_count) if (NOT algorithm_count EQUAL 0) message("'${src}' includes the header. Replace with .") set(found_errors 1) endif() if (NOT memory_count EQUAL 0) message("'${src}' includes the header. Replace with .") set(found_errors 1) endif() if (NOT numeric_count EQUAL 0) message("'${src}' includes the header. Replace with .") set(found_errors 1) endif() endif() if (NOT ${src} IN_LIST legacy_macro_header_exclusions) count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count) count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count) if (NOT thrust_count EQUAL 0) message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.") set(found_errors 1) endif() if (NOT cub_count EQUAL 0) message("'${src}' uses CUB_RUNTIME_ENABLED. 
Replace with CUB_RDC_ENABLED and NV_IF_TARGET.") set(found_errors 1) endif() endif() endforeach() if (NOT found_errors EQUAL 0) message(FATAL_ERROR "Errors detected.") endif() cub-2.0.1/test/cmake/test_install/000077500000000000000000000000001434614775400170465ustar00rootroot00000000000000cub-2.0.1/test/cmake/test_install/CMakeLists.txt000066400000000000000000000053141434614775400216110ustar00rootroot00000000000000# Test that an installation of the project can be located by find_package() call # with appropriate prefix settings. # # Expects CUB_BINARY_DIR to be set to an existing cub build directory. cmake_minimum_required(VERSION 3.15) project(CubTestInstall CXX CUDA) # This will eventually get deleted recursively -- keep that in mind if modifying set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/install_prefix/") function(do_manual_install) # Inspired by the VTK-m install tests, we can just glob up all of the # cmake_install.cmake, include (ie. run) them, and they'll effectively # install the project into the current value of CMAKE_INSTALL_PREFIX. # Gather all of the install files from CUB's root: file(GLOB_RECURSE install_files LIST_DIRECTORIES False "${CUB_BINARY_DIR}/cmake_install.cmake" ) message(STATUS "Locating install files...") foreach (install_file IN LISTS install_files) message(STATUS " * ${install_file}") endforeach() message(STATUS "Building install tree...") foreach(install_file IN LISTS install_files) include("${install_file}") endforeach() endfunction() function(do_cleanup) message(STATUS "Removing ${CMAKE_INSTALL_PREFIX}") file(REMOVE_RECURSE "${CMAKE_INSTALL_PREFIX}") endfunction() function(assert_boolean var_name expect) if (expect) if (NOT ${var_name}) message(FATAL_ERROR "'${var_name}' is false, expected true.") endif() else() if (${var_name}) message(FATAL_ERROR "'${var_name}' is true, expected false.") endif() endif() endfunction() function(assert_target target_name) if (NOT TARGET "${target_name}") message(FATAL_ERROR "Target '${target_name}' not defined.") endif() endfunction() function(find_installed_project) set(CMAKE_PREFIX_PATH "${CMAKE_INSTALL_PREFIX}") find_package(CUB CONFIG) if (NOT CUB_FOUND) message(FATAL_ERROR "find_package(CUB) failed. " "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" ) endif() # Test some internal config vars to check that this is the expected install: # TODO The cmake_path (3.19) command will provide more robust ways to do this # Escape regex special characters in the install prefix, see # https://gitlab.kitware.com/cmake/cmake/-/issues/18580 string(REGEX REPLACE "([][+.*()^])" "\\\\\\1" prefix_regex "${CMAKE_INSTALL_PREFIX}" ) if (NOT _CUB_INCLUDE_DIR MATCHES "^${prefix_regex}") message(FATAL_ERROR "Found CUB in unexpected location: " " * _CUB_INCLUDE_DIR=${_CUB_INCLUDE_DIR} " " * ExpectedPrefix=${CMAKE_INSTALL_DIR}" ) endif() assert_target(CUB::CUB) endfunction() do_cleanup() # Prepare for new installation do_manual_install() find_installed_project() do_cleanup() # Clean up if successful cub-2.0.1/test/half.h000066400000000000000000000226671434614775400143610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once /** * \file * Utilities for interacting with the opaque CUDA __half type */ #include #include #include #include #include #ifdef __GNUC__ // There's a ton of type-punning going on in this file. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif /****************************************************************************** * half_t ******************************************************************************/ /** * Host-based fp16 data type compatible and convertible with __half */ struct half_t { uint16_t __x; /// Constructor from __half __host__ __device__ __forceinline__ half_t(const __half &other) { __x = reinterpret_cast(other); } /// Constructor from integer __host__ __device__ __forceinline__ half_t(int a) { *this = half_t(float(a)); } /// Constructor from std::size_t __host__ __device__ __forceinline__ half_t(std::size_t a) { *this = half_t(float(a)); } /// Default constructor half_t() = default; /// Constructor from float __host__ __device__ __forceinline__ half_t(float a) { // Stolen from Norbert Juffa uint32_t ia = *reinterpret_cast(&a); uint16_t ir; ir = (ia >> 16) & 0x8000; if ((ia & 0x7f800000) == 0x7f800000) { if ((ia & 0x7fffffff) == 0x7f800000) { ir |= 0x7c00; /* infinity */ } else { ir = 0x7fff; /* canonical NaN */ } } else if ((ia & 0x7f800000) >= 0x33000000) { int32_t shift = (int32_t) ((ia >> 23) & 0xff) - 127; if (shift > 15) { ir |= 0x7c00; /* infinity */ } else { ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */ if (shift < -14) { /* denormal */ ir |= ia >> (-1 - shift); ia = ia << (32 - (-1 - shift)); } else { /* normal */ ir |= ia >> (24 - 11); ia = ia << (32 - (24 - 11)); ir = static_cast(ir + ((14 + shift) << 10)); } /* IEEE-754 round to nearest of even */ if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) { ir++; } } } this->__x = ir; } /// Cast to __half __host__ __device__ __forceinline__ operator __half() const { return reinterpret_cast(__x); } /// Cast to float __host__ __device__ 
__forceinline__ operator float() const { // Stolen from Andrew Kerr int sign = ((this->__x >> 15) & 1); int exp = ((this->__x >> 10) & 0x1f); int mantissa = (this->__x & 0x3ff); std::uint32_t f = 0; if (exp > 0 && exp < 31) { // normal exp += 112; f = (sign << 31) | (exp << 23) | (mantissa << 13); } else if (exp == 0) { if (mantissa) { // subnormal exp += 113; while ((mantissa & (1 << 10)) == 0) { mantissa <<= 1; exp--; } mantissa &= 0x3ff; f = (sign << 31) | (exp << 23) | (mantissa << 13); } else if (sign) { f = 0x80000000; // negative zero } else { f = 0x0; // zero } } else if (exp == 31) { if (mantissa) { f = 0x7fffffff; // not a number } else { f = (0xff << 23) | (sign << 31); // inf } } static_assert(sizeof(float) == sizeof(std::uint32_t), "4-byte size check"); float ret{}; std::memcpy(&ret, &f, sizeof(float)); return ret; } /// Get raw storage __host__ __device__ __forceinline__ uint16_t raw() const { return this->__x; } /// Equality __host__ __device__ __forceinline__ bool operator ==(const half_t &other) const { return (this->__x == other.__x); } /// Inequality __host__ __device__ __forceinline__ bool operator !=(const half_t &other) const { return (this->__x != other.__x); } /// Assignment by sum __host__ __device__ __forceinline__ half_t& operator +=(const half_t &rhs) { *this = half_t(float(*this) + float(rhs)); return *this; } /// Multiply __host__ __device__ __forceinline__ half_t operator*(const half_t &other) { return half_t(float(*this) * float(other)); } /// Divide __host__ __device__ __forceinline__ half_t operator/(const half_t &other) const { return half_t(float(*this) / float(other)); } /// Add __host__ __device__ __forceinline__ half_t operator+(const half_t &other) { return half_t(float(*this) + float(other)); } /// Sub __host__ __device__ __forceinline__ half_t operator-(const half_t &other) const { return half_t(float(*this) - float(other)); } /// Less-than __host__ __device__ __forceinline__ bool operator<(const half_t &other) const { return float(*this) < float(other); } /// Less-than-equal __host__ __device__ __forceinline__ bool operator<=(const half_t &other) const { return float(*this) <= float(other); } /// Greater-than __host__ __device__ __forceinline__ bool operator>(const half_t &other) const { return float(*this) > float(other); } /// Greater-than-equal __host__ __device__ __forceinline__ bool operator>=(const half_t &other) const { return float(*this) >= float(other); } /// numeric_traits::max __host__ __device__ __forceinline__ static half_t (max)() { uint16_t max_word = 0x7BFF; return reinterpret_cast(max_word); } /// numeric_traits::lowest __host__ __device__ __forceinline__ static half_t lowest() { uint16_t lowest_word = 0xFBFF; return reinterpret_cast(lowest_word); } }; /****************************************************************************** * I/O stream overloads ******************************************************************************/ /// Insert formatted \p half_t into the output stream std::ostream& operator<<(std::ostream &out, const half_t &x) { out << (float)x; return out; } /// Insert formatted \p __half into the output stream std::ostream& operator<<(std::ostream &out, const __half &x) { return out << half_t(x); } /****************************************************************************** * Traits overloads ******************************************************************************/ template <> struct CUB_NS_QUALIFIER::FpLimits { static __host__ __device__ __forceinline__ half_t Max() { return (half_t::max)(); } static 
__host__ __device__ __forceinline__ half_t Lowest() { return half_t::lowest(); } }; template <> struct CUB_NS_QUALIFIER::NumericTraits : CUB_NS_QUALIFIER:: BaseTraits {}; #ifdef __GNUC__ #pragma GCC diagnostic pop #endif cub-2.0.1/test/link_a.cu000066400000000000000000000004731434614775400150530ustar00rootroot00000000000000#include void a() { printf("a() called\n"); cub::DoubleBuffer d_keys; cub::DoubleBuffer d_values; size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024); } cub-2.0.1/test/link_b.cu000066400000000000000000000004731434614775400150540ustar00rootroot00000000000000#include void b() { printf("b() called\n"); cub::DoubleBuffer d_keys; cub::DoubleBuffer d_values; size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024); } cub-2.0.1/test/link_main.cpp000066400000000000000000000001611434614775400157240ustar00rootroot00000000000000#include extern void a(); extern void b(); int main() { printf("hello world\n"); return 0; } cub-2.0.1/test/mersenne.h000066400000000000000000000122601434614775400152470ustar00rootroot00000000000000/* A C-program for MT19937, with initialization improved 2002/1/26. Coded by Takuji Nishimura and Makoto Matsumoto. Before using, initialize the state by using init_genrand(seed) or init_by_array(init_key, key_length). Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Any feedback is very welcome. 
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) */ #include namespace mersenne { /* Period parameters */ const unsigned int N = 624; const unsigned int M = 397; const unsigned int MATRIX_A = 0x9908b0df; /* constant vector a */ const unsigned int UPPER_MASK = 0x80000000; /* most significant w-r bits */ const unsigned int LOWER_MASK = 0x7fffffff; /* least significant r bits */ static unsigned int mt[N]; /* the array for the state vector */ static int mti = N + 1; /* mti==N+1 means mt[N] is not initialized */ /* initializes mt[N] with a seed */ void init_genrand(unsigned int s) { mt[0] = s & 0xffffffff; for (mti = 1; mti < static_cast(N); mti++) { mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti); /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for mtiplier. */ /* In the previous versions, MSBs of the seed affect */ /* only MSBs of the array mt[]. */ /* 2002/01/09 modified by Makoto Matsumoto */ mt[mti] &= 0xffffffff; /* for >32 bit machines */ } } /* initialize by an array with array-length */ /* init_key is the array for initializing keys */ /* key_length is its length */ /* slight change for C++, 2004/2/26 */ void init_by_array(unsigned int init_key[], int key_length) { int i, j, k; init_genrand(19650218); i = 1; j = 0; k = (static_cast(N) > key_length ? static_cast(N) : key_length); for (; k; k--) { mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1664525)) + init_key[j] + j; /* non linear */ mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ i++; j++; if (i >= static_cast(N)) { mt[0] = mt[N - 1]; i = 1; } if (j >= key_length) j = 0; } for (k = N - 1; k; k--) { mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1566083941)) - i; /* non linear */ mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ i++; if (i >= static_cast(N)) { mt[0] = mt[N - 1]; i = 1; } } mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */ } /* generates a random number on [0,0xffffffff]-interval */ unsigned int genrand_int32(void) { unsigned int y; static unsigned int mag01[2] = { 0x0, MATRIX_A }; /* mag01[x] = x * MATRIX_A for x=0,1 */ if (mti >= static_cast(N)) { /* generate N words at one time */ int kk; if (mti == N + 1) /* if init_genrand() has not been called, */ init_genrand(5489); /* a defat initial seed is used */ for (kk = 0; kk < static_cast(N - M); kk++) { y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & 0x1]; } for (; kk < static_cast(N - 1); kk++) { y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & 0x1]; } y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK); mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & 0x1]; mti = 0; } y = mt[mti++]; /* Tempering */ y ^= (y >> 11); y ^= (y << 7) & 0x9d2c5680; y ^= (y << 15) & 0xefc60000; y ^= (y >> 18); return y; } } // namespace mersenne cub-2.0.1/test/test_allocator.cu000066400000000000000000000401571434614775400166400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test evaluation for caching allocator of device memory ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=]" "[--bytes=]" "[--i=]" "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Get number of GPUs and current GPU int num_gpus; int initial_gpu; int timing_iterations = 10000; int timing_bytes = 1024 * 1024; if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1); if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1); args.GetCmdLineArgument("i", timing_iterations); args.GetCmdLineArgument("bytes", timing_bytes); // Create default allocator (caches up to 6MB in device allocations per GPU) CachingDeviceAllocator allocator; allocator.debug = true; printf("Running single-gpu tests...\n"); fflush(stdout); // // Test0 // // Create a new stream cudaStream_t other_stream; CubDebugExit(cudaStreamCreate(&other_stream)); // Allocate 999 bytes on the current gpu in stream0 char *d_999B_stream0_a; char *d_999B_stream0_b; CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0)); // Run some big kernel in stream 0 EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); // Free d_999B_stream0_a CubDebugExit(allocator.DeviceFree(d_999B_stream0_a)); // Allocate another 999 bytes in stream 0 CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0)); // Check that that we have 1 live block on the initial GPU AssertEquals(allocator.live_blocks.size(), 1); // Check that that we have no cached block on the initial GPU AssertEquals(allocator.cached_blocks.size(), 0); // Run some big kernel in stream 0 EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); // Free d_999B_stream0_b CubDebugExit(allocator.DeviceFree(d_999B_stream0_b)); // Allocate 999 bytes on the current gpu 
in other_stream char *d_999B_stream_other_a; char *d_999B_stream_other_b; allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream); // Check that that we have 1 live blocks on the initial GPU (that we allocated a new one because d_999B_stream0_b is only available for stream 0 until it becomes idle) AssertEquals(allocator.live_blocks.size(), 1); // Check that that we have one cached block on the initial GPU AssertEquals(allocator.cached_blocks.size(), 1); // Run some big kernel in other_stream EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); // Free d_999B_stream_other CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a)); // Check that we can now use both allocations in stream 0 after synchronizing the device CubDebugExit(cudaDeviceSynchronize()); CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0)); CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0)); // Check that that we have 2 live blocks on the initial GPU AssertEquals(allocator.live_blocks.size(), 2); // Check that that we have no cached block on the initial GPU AssertEquals(allocator.cached_blocks.size(), 0); // Free d_999B_stream0_a and d_999B_stream0_b CubDebugExit(allocator.DeviceFree(d_999B_stream0_a)); CubDebugExit(allocator.DeviceFree(d_999B_stream0_b)); // Check that we can now use both allocations in other_stream CubDebugExit(cudaDeviceSynchronize()); CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream)); CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream)); // Check that that we have 2 live blocks on the initial GPU AssertEquals(allocator.live_blocks.size(), 2); // Check that that we have no cached block on the initial GPU AssertEquals(allocator.cached_blocks.size(), 0); // Run some big kernel in other_stream EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); // Free d_999B_stream_other_a and d_999B_stream_other_b CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a)); CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b)); // Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other stream CubDebugExit(cudaDeviceSynchronize()); CubDebugExit(cudaStreamDestroy(other_stream)); CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0)); CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0)); // Check that that we have 2 live blocks on the initial GPU AssertEquals(allocator.live_blocks.size(), 2); // Check that that we have no cached block on the initial GPU AssertEquals(allocator.cached_blocks.size(), 0); // Free d_999B_stream0_a and d_999B_stream0_b CubDebugExit(allocator.DeviceFree(d_999B_stream0_a)); CubDebugExit(allocator.DeviceFree(d_999B_stream0_b)); // Free all cached CubDebugExit(allocator.FreeAllCached()); // // Test1 // // Allocate 5 bytes on the current gpu char *d_5B; CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5)); // Check that that we have zero free bytes cached on the initial GPU AssertEquals(allocator.cached_bytes[initial_gpu].free, 0); // Check that that we have 1 live block on the initial GPU AssertEquals(allocator.live_blocks.size(), 1); // // Test2 // // Allocate 4096 bytes on the current gpu char *d_4096B; CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096)); // Check that that we have 2 live blocks on the initial GPU AssertEquals(allocator.live_blocks.size(), 2); // // Test3 // // DeviceFree d_5B 
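    // (Note, based on the asserts that follow: the 5-byte request was rounded
    // up to the allocator's smallest bin, so freeing it below should return
    // min_bin_bytes of free space to the cache.)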
CubDebugExit(allocator.DeviceFree(d_5B)); // Check that that we have min_bin_bytes free bytes cached on the initial gpu AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes); // Check that that we have 1 live block on the initial GPU AssertEquals(allocator.live_blocks.size(), 1); // Check that that we have 1 cached block on the initial GPU AssertEquals(allocator.cached_blocks.size(), 1); // // Test4 // // DeviceFree d_4096B CubDebugExit(allocator.DeviceFree(d_4096B)); // Check that that we have the 4096 + min_bin free bytes cached on the initial gpu AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096); // Check that that we have 0 live block on the initial GPU AssertEquals(allocator.live_blocks.size(), 0); // Check that that we have 2 cached block on the initial GPU AssertEquals(allocator.cached_blocks.size(), 2); // // Test5 // // Allocate 768 bytes on the current gpu char *d_768B; CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768)); // Check that that we have the min_bin free bytes cached on the initial gpu (4096 was reused) AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes); // Check that that we have 1 live block on the initial GPU AssertEquals(allocator.live_blocks.size(), 1); // Check that that we have 1 cached block on the initial GPU AssertEquals(allocator.cached_blocks.size(), 1); // // Test6 // // Allocate max_cached_bytes on the current gpu char *d_max_cached; CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes)); // DeviceFree d_max_cached CubDebugExit(allocator.DeviceFree(d_max_cached)); // Check that that we have the min_bin free bytes cached on the initial gpu (max cached was not returned because we went over) AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes); // Check that that we have 1 live block on the initial GPU AssertEquals(allocator.live_blocks.size(), 1); // Check that that we still have 1 cached block on the initial GPU AssertEquals(allocator.cached_blocks.size(), 1); // // Test7 // // Free all cached blocks on all GPUs CubDebugExit(allocator.FreeAllCached()); // Check that that we have 0 bytes cached on the initial GPU AssertEquals(allocator.cached_bytes[initial_gpu].free, 0); // Check that that we have 0 cached blocks across all GPUs AssertEquals(allocator.cached_blocks.size(), 0); // Check that that still we have 1 live block across all GPUs AssertEquals(allocator.live_blocks.size(), 1); // // Test8 // // Allocate max cached bytes + 1 on the current gpu char *d_max_cached_plus; CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1)); // DeviceFree max cached bytes CubDebugExit(allocator.DeviceFree(d_max_cached_plus)); // DeviceFree d_768B CubDebugExit(allocator.DeviceFree(d_768B)); unsigned int power; size_t rounded_bytes; allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768); // Check that that we have 4096 free bytes cached on the initial gpu AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes); // Check that that we have 1 cached blocks across all GPUs AssertEquals(allocator.cached_blocks.size(), 1); // Check that that still we have 0 live block across all GPUs AssertEquals(allocator.live_blocks.size(), 0); // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not if (num_gpus > 1) { printf("\nRunning multi-gpu tests...\n"); fflush(stdout); // // Test9 // // Allocate 768 bytes on the 
next gpu int next_gpu = (initial_gpu + 1) % num_gpus; char *d_768B_2; CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768)); // DeviceFree d_768B on the next gpu CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2)); // Re-allocate 768 bytes on the next gpu CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768)); // Re-free d_768B on the next gpu CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2)); // Check that that we have 4096 free bytes cached on the initial gpu AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes); // Check that that we have 4096 free bytes cached on the second gpu AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes); // Check that that we have 2 cached blocks across all GPUs AssertEquals(allocator.cached_blocks.size(), 2); // Check that that still we have 0 live block across all GPUs AssertEquals(allocator.live_blocks.size(), 0); } // // Performance // printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes); fflush(stdout); fflush(stderr); // CPU performance comparisons vs cached. Allocate and free a 1MB block 2000 times CpuTimer cpu_timer; char *d_1024MB = NULL; allocator.debug = false; // Prime the caching allocator and the kernel CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes)); CubDebugExit(allocator.DeviceFree(d_1024MB)); cub::EmptyKernel<<<1, 32>>>(); // CUDA cpu_timer.Start(); for (int i = 0; i < timing_iterations; ++i) { CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes)); CubDebugExit(cudaFree(d_1024MB)); } cpu_timer.Stop(); float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis(); // CUB cpu_timer.Start(); for (int i = 0; i < timing_iterations; ++i) { CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes)); CubDebugExit(allocator.DeviceFree(d_1024MB)); } cpu_timer.Stop(); float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis(); printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n", cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis, cuda_malloc_elapsed_millis / timing_iterations, cub_calloc_elapsed_millis / timing_iterations); // GPU performance comparisons. Allocate and free a 1MB block 2000 times GpuTimer gpu_timer; printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes); fflush(stdout); fflush(stderr); // Kernel-only gpu_timer.Start(); for (int i = 0; i < timing_iterations; ++i) { cub::EmptyKernel<<<1, 32>>>(); } gpu_timer.Stop(); float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis(); // CUDA gpu_timer.Start(); for (int i = 0; i < timing_iterations; ++i) { CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes)); cub::EmptyKernel<<<1, 32>>>(); CubDebugExit(cudaFree(d_1024MB)); } gpu_timer.Stop(); cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis; // CUB gpu_timer.Start(); for (int i = 0; i < timing_iterations; ++i) { CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes)); cub::EmptyKernel<<<1, 32>>>(); CubDebugExit(allocator.DeviceFree(d_1024MB)); } gpu_timer.Stop(); cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis; printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. 
avg DeviceAllocate %.4f ms)\n", cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis, cuda_malloc_elapsed_millis / timing_iterations, cub_calloc_elapsed_millis / timing_iterations); printf("Success\n"); return 0; } cub-2.0.1/test/test_block_adjacent_difference.cu000066400000000000000000000514311434614775400217520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of BlockAdjacentDifference utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; template using CountingIteratorT = typename thrust::counting_iterator; /** * \brief Generates integer sequence \f$S_n=i(i-1)/2\f$. 
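 *
 * For instance, the first few terms are 0, 0, 1, 3, 6, 10, whose
 * left-adjacent differences are 0, 1, 2, 3, 4 (an illustrative check of the
 * derivation below). The tests that follow fill device input with this
 * generator, for example:
 * \code
 * thrust::device_vector<DataType> d_input(tile_size);
 * thrust::tabulate(d_input.begin(), d_input.end(), TestSequenceGenerator<DataType>{});
 * \endcode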
* * The adjacent difference of this sequence produce consecutive numbers: * \f[ * p = \frac{i(i - 1)}{2} \\ * n = \frac{(i + 1) i}{2} \\ * n - p = i \\ * \frac{(i + 1) i}{2} - \frac{i (i - 1)}{2} = i \\ * (i + 1) i - i (i - 1) = 2 i \\ * (i + 1) - (i - 1) = 2 \\ * 2 = 2 * \f] */ template struct TestSequenceGenerator { std::size_t offset; TestSequenceGenerator(std::size_t offset = 0) : offset(offset) {} template __device__ __host__ DestT operator()(SourceT index) const { index += static_cast(offset); return static_cast(index * (index - 1) / SourceT(2)); } }; struct CustomType { unsigned int key; unsigned int value; __device__ __host__ CustomType() : key(0) , value(0) {} __device__ __host__ CustomType(unsigned int key, unsigned int value) : key(key) , value(value) {} }; __device__ __host__ bool operator==(const CustomType& lhs, const CustomType& rhs) { return lhs.key == rhs.key && lhs.value == rhs.value; } __device__ __host__ bool operator!=(const CustomType& lhs, const CustomType& rhs) { return !(lhs == rhs); } __device__ __host__ CustomType operator-(const CustomType& lhs, const CustomType& rhs) { return CustomType{lhs.key - rhs.key, lhs.value - rhs.value}; } struct CustomDifference { template __device__ DataType operator()(DataType &lhs, DataType &rhs) { return lhs - rhs; } }; template __global__ void AdjDiffKernel(const DataType *input, DataType *output, ActionT action, bool in_place) { using BlockAdjacentDifferenceT = cub::BlockAdjacentDifference; __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; DataType thread_data[ItemsPerThread]; DataType thread_result[ItemsPerThread]; const unsigned int thread_offset = threadIdx.x * ItemsPerThread; for (unsigned int item = 0; item < ItemsPerThread; item++) { thread_data[item] = input[thread_offset + item]; } __syncthreads(); BlockAdjacentDifferenceT block_adj_diff(temp_storage); if (in_place) { action(thread_data, thread_data, block_adj_diff); for (unsigned int item = 0; item < ItemsPerThread; item++) { output[thread_offset + item] = thread_data[item]; } } else { action(thread_data, thread_result, block_adj_diff); for (unsigned int item = 0; item < ItemsPerThread; item++) { output[thread_offset + item] = thread_result[item]; } } } template void AdjDiffTest(const DataType *input, DataType *output, ActionT action, bool in_place = false) { AdjDiffKernel <<<1, ThreadsInBlock>>>(input, output, action, in_place); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); } template struct LastTileOpT { unsigned int m_valid_items{}; __host__ LastTileOpT(unsigned int valid_items) : m_valid_items(valid_items) {} template __device__ void operator()(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockAdjDiff &block_adj_diff) const { if (ReadLeft) { block_adj_diff.SubtractLeftPartialTile(input, output, CustomDifference(), m_valid_items); } else { block_adj_diff.SubtractRightPartialTile(input, output, CustomDifference(), m_valid_items); } } }; template struct MiddleTileOpT { DataType m_neighbour_tile_value; __host__ MiddleTileOpT(DataType neighbour_tile_value) : m_neighbour_tile_value(neighbour_tile_value) {} template __device__ void operator()(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockAdjDiff &block_adj_diff) const { if (ReadLeft) { block_adj_diff.SubtractLeft(input, output, CustomDifference(), m_neighbour_tile_value); } else { block_adj_diff.SubtractRight(input, output, CustomDifference(), m_neighbour_tile_value); } } }; template struct BaseOpT { template __device__ void operator()(T 
(&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockAdjDiff &block_adj_diff) const { if (ReadLeft) { block_adj_diff.SubtractLeft(input, output, CustomDifference()); } else { block_adj_diff.SubtractRight(input, output, CustomDifference()); } } }; template void LastTileTest(const DataType *input, DataType *output, unsigned int valid_items, bool in_place) { AdjDiffTest(input, output, LastTileOpT{ valid_items}, in_place); } template struct LastTileWithPredOpT { unsigned int m_valid_items; DataType m_neighbour_tile_value; __host__ LastTileWithPredOpT( unsigned int valid_items, DataType neighbour_tile_value) : m_valid_items(valid_items) , m_neighbour_tile_value(neighbour_tile_value) { } template __device__ void operator()(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockAdjDiff &block_adj_diff) const { block_adj_diff.SubtractLeftPartialTile(input, output, CustomDifference(), m_valid_items, m_neighbour_tile_value); } }; template void LastTileWithPredTest(const DataType *input, DataType *output, unsigned int valid_items, DataType neighbour_tile_value, bool in_place) { AdjDiffTest( input, output, LastTileWithPredOpT{valid_items, neighbour_tile_value}, in_place); } template void Test(DataType *data, bool in_place) { AdjDiffTest( data, data, BaseOpT{}, in_place); } template void MiddleTileTest(const DataType *input, DataType *output, DataType neighbour_tile_value, bool in_place) { AdjDiffTest( input, output, MiddleTileOpT{neighbour_tile_value}, in_place); } template bool CheckResult(FirstIteratorT first_begin, FirstIteratorT first_end, SecondOperatorT second_begin) { auto err = thrust::mismatch(first_begin, first_end, second_begin); if (err.first != first_end) { return false; } return true; } template void TestLastTile(bool inplace, unsigned int num_items, thrust::device_vector &d_input) { thrust::tabulate(d_input.begin(), d_input.end(), TestSequenceGenerator{}); thrust::device_vector d_output(d_input.size()); constexpr bool read_left = true; constexpr bool read_right = false; DataType *d_input_ptr = thrust::raw_pointer_cast(d_input.data()); DataType *d_output_ptr = thrust::raw_pointer_cast(d_output.data()); LastTileTest( d_input_ptr, d_output_ptr, num_items, inplace); { AssertEquals(d_output.front(), d_input.front()); AssertTrue(CheckResult(d_output.begin() + 1, d_output.begin() + num_items, CountingIteratorT(DataType{0}))); AssertTrue(CheckResult(d_output.begin() + num_items, d_output.end(), d_input.begin() + num_items)); } if (num_items > 0) { LastTileWithPredTest( d_input_ptr + 1, d_output_ptr, num_items - 1, TestSequenceGenerator{}(0), inplace); AssertTrue(CheckResult(d_output.begin(), d_output.begin() + num_items - 1, CountingIteratorT(DataType{0}))); AssertTrue(CheckResult(d_output.begin() + num_items - 1, d_output.end() - 1, d_input.begin() + num_items)); } thrust::tabulate(d_input.begin(), d_input.end(), TestSequenceGenerator{}); LastTileTest( d_input_ptr, d_output_ptr, num_items, inplace); { thrust::device_vector reference(num_items); thrust::sequence(reference.begin(), reference.end(), static_cast(0), static_cast(-1)); AssertTrue(CheckResult(d_output.begin(), d_output.begin() + num_items - 1, reference.begin())); AssertTrue(CheckResult(d_output.begin() + num_items - 1, d_output.end(), d_input.begin() + num_items - 1)); } } template void TestMiddleTile(bool inplace, thrust::device_vector &d_input) { thrust::tabulate(d_input.begin(), d_input.end(), TestSequenceGenerator{std::size_t{1}}); thrust::device_vector d_output(d_input.size()); constexpr bool read_left = 
true; constexpr bool read_right = false; DataType *d_input_ptr = thrust::raw_pointer_cast(d_input.data()); DataType *d_output_ptr = thrust::raw_pointer_cast(d_output.data()); const DataType left_tile_last_value{0}; const DataType right_tile_first_value{ TestSequenceGenerator{}(d_input.size()) }; MiddleTileTest( d_input_ptr, d_output_ptr, left_tile_last_value, inplace); { AssertTrue(CheckResult(d_output.begin(), d_output.end(), CountingIteratorT(DataType{0}))); } thrust::tabulate(d_input.begin(), d_input.end(), TestSequenceGenerator{}); MiddleTileTest( d_input_ptr, d_output_ptr, right_tile_first_value, inplace); { thrust::device_vector reference(d_input.size()); thrust::sequence(reference.begin(), reference.end(), static_cast(0), static_cast(-1)); AssertTrue(CheckResult(d_output.begin(), d_output.end(), reference.begin())); } } struct IntToCustomType { unsigned int offset; IntToCustomType() : offset(0) {} explicit IntToCustomType(unsigned int offset) : offset(offset) {} __device__ __host__ CustomType operator()(unsigned int idx) const { return { idx + offset, idx + offset }; } }; template void TestFullTile(bool inplace, thrust::device_vector &d_data) { thrust::tabulate(d_data.begin(), d_data.end(), TestSequenceGenerator{}); constexpr bool read_left = true; constexpr bool read_right = false; DataType *d_data_ptr = thrust::raw_pointer_cast(d_data.data()); Test(d_data_ptr, inplace); { AssertEquals(d_data.front(), TestSequenceGenerator{}(0)); AssertTrue(CheckResult(d_data.begin() + 1, d_data.end(), CountingIteratorT(DataType{0}))); } thrust::tabulate(d_data.begin(), d_data.end(), TestSequenceGenerator{}); Test(d_data_ptr, inplace); { thrust::device_vector reference(d_data.size()); thrust::sequence(reference.begin(), reference.end(), static_cast(0), static_cast(-1)); AssertTrue(CheckResult(d_data.begin(), d_data.end() - 1, reference.begin())); AssertEquals(d_data.back(), TestSequenceGenerator{}(d_data.size() - 1)); } } template void TestCustomType(bool inplace, thrust::device_vector &d_data) { thrust::tabulate(d_data.begin(), d_data.end(), IntToCustomType{1}); CustomType *d_data_ptr = thrust::raw_pointer_cast(d_data.data()); constexpr bool read_left = true; constexpr bool read_right = false; Test(d_data_ptr, inplace); { const std::size_t expected_count = d_data.size(); const std::size_t actual_count = thrust::count(d_data.begin(), d_data.end(), CustomType{1, 1}); AssertEquals(expected_count, actual_count); } thrust::tabulate(d_data.begin(), d_data.end(), IntToCustomType{}); Test(d_data_ptr, inplace); { const auto unsigned_minus_one = static_cast(-1); const std::size_t expected_count = d_data.size() - 1; const std::size_t actual_count = thrust::count(d_data.begin(), d_data.end() - 1, CustomType{unsigned_minus_one, unsigned_minus_one}); AssertEquals(expected_count, actual_count); } } template < typename ValueType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock> void Test(bool inplace) { constexpr int tile_size = ItemsPerThread * ThreadsInBlock; thrust::device_vector d_values(tile_size); for (unsigned int num_items = tile_size; num_items > 1; num_items /= 2) { TestLastTile(inplace, num_items, d_values); } TestFullTile(inplace, d_values); TestMiddleTile(inplace, d_values); } template void TestCustomType(bool inplace) { constexpr int tile_size = ItemsPerThread * ThreadsInBlock; thrust::device_vector d_values(tile_size); TestCustomType(inplace, d_values); } template void Test(bool inplace) { Test(inplace); Test(inplace); Test(inplace); Test(inplace); } template void Test(bool inplace) { 
Test(inplace); Test(inplace); } template void Test() { Test(false); Test(true); } int main(int argc, char** argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); Test<1>(); Test<2>(); Test<10>(); Test<15>(); // More of a compilation check TestCustomType<5, 256>(true); return 0; } cub-2.0.1/test/test_block_histogram.cu000066400000000000000000000227461434614775400200330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of BlockHistogram utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include "test_util.h" #include #include #include #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; CachingDeviceAllocator g_allocator(true); //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /** * BlockHistogram test kernel. 
*/ template < int BINS, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockHistogramAlgorithm ALGORITHM, typename T, typename HistoCounter> __global__ void BlockHistogramKernel( T *d_samples, HistoCounter *d_histogram) { // Parameterize BlockHistogram type for our thread block typedef BlockHistogram BlockHistogram; // Allocate temp storage in shared memory __shared__ typename BlockHistogram::TempStorage temp_storage; // Per-thread tile data T data[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_samples, data); // Test histo (writing directly to histogram buffer in global) BlockHistogram(temp_storage).Histogram(data, d_histogram); } // WAR warning "pointless comparison of unsigned with zero" template typename std::enable_if::value, T>::type clamp_input(T val, int bins) { return val % bins; } template typename std::enable_if::value, T>::type clamp_input(T val, int bins) { return (val < 0 ? -val : val) % bins; } /** * Initialize problem (and solution) */ template < int BINS, typename SampleT> void Initialize( GenMode gen_mode, SampleT *h_samples, int *h_histograms_linear, int num_samples) { // Init bins for (int bin = 0; bin < BINS; ++bin) { h_histograms_linear[bin] = 0; } if (g_verbose) { printf("Samples: \n"); } // Initialize interleaved channel samples and histogram them correspondingly for (int i = 0; i < num_samples; ++i) { SampleT sample; InitValue(gen_mode, sample, i); sample = clamp_input(sample, BINS); if (g_verbose) { std::cout << CoutCast(sample) << ", "; } h_samples[i] = sample; h_histograms_linear[sample]++; } if (g_verbose) { printf("\n\n"); } } /** * Test BlockHistogram */ template < typename SampleT, int BINS, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockHistogramAlgorithm ALGORITHM> void Test( GenMode gen_mode) { int num_samples = BLOCK_THREADS * ITEMS_PER_THREAD; printf("cub::BlockHistogram %s %d %s samples (%dB), %d bins, %d threads, gen-mode %s\n", (ALGORITHM == BLOCK_HISTO_SORT) ? "BLOCK_HISTO_SORT" : "BLOCK_HISTO_ATOMIC", num_samples, typeid(SampleT).name(), (int) sizeof(SampleT), BINS, BLOCK_THREADS, (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS"); fflush(stdout); // Allocate host arrays SampleT *h_samples = new SampleT[num_samples]; int *h_reference = new int[BINS]; // Initialize problem Initialize(gen_mode, h_samples, h_reference, num_samples); // Allocate problem device arrays SampleT *d_samples = NULL; int *d_histogram = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * num_samples)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram, sizeof(int) * BINS)); // Initialize/clear device arrays CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * num_samples, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_histogram, 0, sizeof(int) * BINS)); // Run kernel BlockHistogramKernel<<<1, BLOCK_THREADS>>>( d_samples, d_histogram); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults((int*) h_reference, d_histogram, BINS, g_verbose, g_verbose); printf("\t%s\n\n", compare ? 
"FAIL" : "PASS"); // Flush any stdout/stderr CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); fflush(stdout); fflush(stderr); // Cleanup if (h_samples) delete[] h_samples; if (h_reference) delete[] h_reference; if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples)); if (d_histogram) CubDebugExit(g_allocator.DeviceFree(d_histogram)); // Correctness asserts AssertEquals(0, compare); } /** * Test different sample distributions */ template < typename SampleT, int BINS, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockHistogramAlgorithm ALGORITHM> void Test() { Test(UNIFORM); Test(INTEGER_SEED); Test(RANDOM); } /** * Test different ALGORITHM */ template < typename SampleT, int BINS, int BLOCK_THREADS, int ITEMS_PER_THREAD> void Test() { Test(); Test(); } /** * Test different ITEMS_PER_THREAD */ template < typename SampleT, int BINS, int BLOCK_THREADS> void Test() { Test(); Test(); } /** * Test different BLOCK_THREADS */ template < typename SampleT, int BINS> void Test() { Test(); Test(); Test(); } /** * Test different BINS */ template void Test() { Test(); Test(); Test(); } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); Test(); Test(); Test(); Test(); return 0; } cub-2.0.1/test/test_block_load_store.cu000066400000000000000000000437131434614775400201660ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of BlockLoad and BlockStore utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; CachingDeviceAllocator g_allocator(true); //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /** * Test load/store kernel. */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM, typename InputIteratorT, typename OutputIteratorT> __launch_bounds__ (BLOCK_THREADS, 1) __global__ void Kernel( InputIteratorT d_in, OutputIteratorT d_out_unguarded, OutputIteratorT d_out_guarded, int num_items) { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // The input value type using InputT = cub::detail::value_t; // The output value type using OutputT = cub::detail::non_void_value_t; // Threadblock load/store abstraction types using BlockLoad = BlockLoad; using BlockStore = BlockStore; // Shared memory type for this thread block union TempStorage { typename BlockLoad::TempStorage load; typename BlockStore::TempStorage store; }; // Allocate temp storage in shared memory __shared__ TempStorage temp_storage; // Threadblock work bounds int block_offset = blockIdx.x * TILE_SIZE; int guarded_elements = num_items - block_offset; // Tile of items OutputT data[ITEMS_PER_THREAD]; // Load data BlockLoad(temp_storage.load).Load(d_in + block_offset, data); __syncthreads(); // Store data BlockStore(temp_storage.store).Store(d_out_unguarded + block_offset, data); __syncthreads(); // reset data #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) data[ITEM] = OutputT(); __syncthreads(); // Load data BlockLoad(temp_storage.load).Load(d_in + block_offset, data, guarded_elements); __syncthreads(); // Store data BlockStore(temp_storage.store).Store(d_out_guarded + block_offset, data, guarded_elements); } //--------------------------------------------------------------------- // Host testing subroutines //--------------------------------------------------------------------- /** * Test load/store variants */ template < typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM, typename InputIteratorT, typename OutputIteratorT> void TestKernel( T *h_in, InputIteratorT d_in, OutputIteratorT d_out_unguarded_itr, OutputIteratorT d_out_guarded_itr, T *d_out_unguarded_ptr, T *d_out_guarded_ptr, int grid_size, int guarded_elements) { int compare; int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD; // Test with discard output iterator typedef typename std::iterator_traits::difference_type OffsetT; DiscardOutputIterator discard_itr; Kernel <<>>( d_in, discard_itr, discard_itr, guarded_elements); // Test with regular output iterator Kernel <<>>( d_in, d_out_unguarded_itr, d_out_guarded_itr, guarded_elements); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Check 
results compare = CompareDeviceResults(h_in, d_out_guarded_ptr, guarded_elements, g_verbose, g_verbose); printf("\tGuarded: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check results compare = CompareDeviceResults(h_in, d_out_unguarded_ptr, unguarded_elements, g_verbose, g_verbose); printf("\tUnguarded: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); } /** * Test native pointer. Specialized for sufficient resources */ template < typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM> void TestNative( int grid_size, float fraction_valid, Int2Type /*sufficient_resources*/) { int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD; int guarded_elements = int(fraction_valid * float(unguarded_elements)); // Allocate host arrays T *h_in = (T*) malloc(unguarded_elements * sizeof(T)); // Allocate device arrays T *d_in = NULL; T *d_out_unguarded = NULL; T *d_out_guarded = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements)); CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements)); CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements)); // Initialize problem on host and device for (int i = 0; i < unguarded_elements; ++i) { InitValue(INTEGER_SEED, h_in[i], i); } CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice)); printf("TestNative " "grid_size(%d) " "guarded_elements(%d) " "unguarded_elements(%d) " "BLOCK_THREADS(%d) " "ITEMS_PER_THREAD(%d) " "LOAD_ALGORITHM(%d) " "STORE_ALGORITHM(%d) " "sizeof(T)(%d)\n", grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, (int) sizeof(T)); TestKernel( h_in, (T const *) d_in, // Test const d_out_unguarded, d_out_guarded, d_out_unguarded, d_out_guarded, grid_size, guarded_elements); // Cleanup if (h_in) free(h_in); if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded)); if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded)); } /** * Test native pointer. Specialized for insufficient resources */ template < typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM> void TestNative( int /*grid_size*/, float /*fraction_valid*/, Int2Type /*sufficient_resources*/) {} /** * Test iterator. Specialized for sufficient resources. 
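 *
 * The Int2Type tag argument is a compile-time dispatch computed in
 * TestPointerType: this overload is selected only when the BlockLoad and
 * BlockStore temp storage fits in shared memory and BLOCK_THREADS is
 * supported; otherwise the empty overload below is selected, so oversized
 * configurations are never instantiated.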
*/ template < typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM, CacheLoadModifier LOAD_MODIFIER, CacheStoreModifier STORE_MODIFIER> void TestIterator( int grid_size, float fraction_valid, Int2Type /*sufficient_resources*/) { int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD; int guarded_elements = int(fraction_valid * float(unguarded_elements)); // Allocate host arrays T *h_in = (T*) malloc(unguarded_elements * sizeof(T)); // Allocate device arrays T *d_in = NULL; T *d_out_unguarded = NULL; T *d_out_guarded = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements)); CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements)); CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements)); // Initialize problem on host and device for (int i = 0; i < unguarded_elements; ++i) { InitValue(INTEGER_SEED, h_in[i], i); } CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice)); printf("TestIterator " "grid_size(%d) " "guarded_elements(%d) " "unguarded_elements(%d) " "BLOCK_THREADS(%d) " "ITEMS_PER_THREAD(%d) " "LOAD_ALGORITHM(%d) " "STORE_ALGORITHM(%d) " "LOAD_MODIFIER(%d) " "STORE_MODIFIER(%d) " "sizeof(T)(%d)\n", grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_MODIFIER, STORE_MODIFIER, (int) sizeof(T)); TestKernel( h_in, CacheModifiedInputIterator(d_in), CacheModifiedOutputIterator(d_out_unguarded), CacheModifiedOutputIterator(d_out_guarded), d_out_unguarded, d_out_guarded, grid_size, guarded_elements); // Cleanup if (h_in) free(h_in); if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded)); if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded)); } /** * Test iterator. Specialized for insufficient resources. 
*/ template < typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM, CacheLoadModifier LOAD_MODIFIER, CacheStoreModifier STORE_MODIFIER> void TestIterator( int /*grid_size*/, float /*fraction_valid*/, Int2Type /*sufficient_resources*/) {} /** * Evaluate different pointer access types */ template < typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM> void TestPointerType( int grid_size, float fraction_valid) { // Threadblock load/store abstraction types typedef BlockLoad BlockLoad; typedef BlockStore BlockStore; static const bool sufficient_load_smem = sizeof(typename BlockLoad::TempStorage) <= 1024 * 48; static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 48; static const bool sufficient_threads = BLOCK_THREADS <= 1024; static const bool sufficient_resources = sufficient_load_smem && sufficient_store_smem && sufficient_threads; TestNative(grid_size, fraction_valid, Int2Type()); TestIterator(grid_size, fraction_valid, Int2Type()); } /** * Evaluate different time-slicing strategies */ template < typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM> void TestSlicedStrategy( int grid_size, float fraction_valid) { TestPointerType(grid_size, fraction_valid); TestPointerType(grid_size, fraction_valid); } /** * Evaluate different load/store strategies (specialized for block sizes that are not a multiple of 32) */ template < typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD> void TestStrategy( int grid_size, float fraction_valid, Int2Type /*is_warp_multiple*/) { TestPointerType(grid_size, fraction_valid); TestPointerType(grid_size, fraction_valid); TestPointerType(grid_size, fraction_valid); TestPointerType(grid_size, fraction_valid); } /** * Evaluate different load/store strategies (specialized for block sizes that are a multiple of 32) */ template < typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD> void TestStrategy( int grid_size, float fraction_valid, Int2Type /*is_warp_multiple*/) { TestStrategy(grid_size, fraction_valid, Int2Type()); TestPointerType(grid_size, fraction_valid); TestPointerType(grid_size, fraction_valid); } /** * Evaluate different register blocking */ template < typename T, int BLOCK_THREADS> void TestItemsPerThread( int grid_size, float fraction_valid) { Int2Type is_warp_multiple; TestStrategy(grid_size, fraction_valid, is_warp_multiple); TestStrategy(grid_size, fraction_valid, is_warp_multiple); } /** * Evaluate different thread block sizes */ template void TestThreads( int grid_size, float fraction_valid) { TestItemsPerThread(grid_size, fraction_valid); TestItemsPerThread(grid_size, fraction_valid); TestItemsPerThread(grid_size, fraction_valid); TestItemsPerThread(grid_size, fraction_valid); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Get ptx version int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); // %PARAM% TEST_VALUE_TYPES types 0:1:2 // Compile/run thorough tests #if TEST_VALUE_TYPES == 0 TestThreads(2, 0.8f); TestThreads(2, 0.8f); TestThreads(2, 0.8f); #elif TEST_VALUE_TYPES == 1 TestThreads(2, 0.8f); TestThreads(2, 0.8f); #elif 
TEST_VALUE_TYPES == 2 TestThreads(2, 0.8f); TestThreads(2, 0.8f); #endif return 0; } cub-2.0.1/test/test_block_merge_sort.cu000066400000000000000000000305341434614775400201760ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of BlockMergeSort utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; struct CustomType { std::uint8_t key; std::uint64_t count; __device__ __host__ CustomType() : key(0) , count(0) {} __device__ __host__ CustomType(std::uint64_t value) : key(static_cast(value)) , count(value) {} __device__ __host__ void operator=(std::uint64_t value) { key = static_cast(value); count = value; } }; struct CustomLess { template __device__ bool operator()(DataType &lhs, DataType &rhs) { return lhs < rhs; } __device__ bool operator()(CustomType &lhs, CustomType &rhs) { return lhs.key < rhs.key; } }; template < typename DataType, unsigned int ThreadsInBlock, unsigned int ItemsPerThread, bool Stable = false> __global__ void BlockMergeSortTestKernel(DataType *data, unsigned int valid_items) { using BlockMergeSort = cub::BlockMergeSort; __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle; DataType thread_data[ItemsPerThread]; const unsigned int thread_offset = threadIdx.x * ItemsPerThread; for (unsigned int item = 0; item < ItemsPerThread; item++) { const unsigned int idx = thread_offset + item; thread_data[item] = idx < valid_items ? data[idx] : DataType(); } __syncthreads(); // Tests below use sequence to fill the data. 
// Therefore the following value should be greater than any that // is present in the input data. const DataType oob_default = static_cast(ThreadsInBlock * ItemsPerThread + 1); if (Stable) { if (valid_items == ThreadsInBlock * ItemsPerThread) { BlockMergeSort(temp_storage_shuffle).StableSort( thread_data, CustomLess()); } else { BlockMergeSort(temp_storage_shuffle).StableSort( thread_data, CustomLess(), valid_items, oob_default); } } else { if (valid_items == ThreadsInBlock * ItemsPerThread) { BlockMergeSort(temp_storage_shuffle).Sort( thread_data, CustomLess()); } else { BlockMergeSort(temp_storage_shuffle).Sort( thread_data, CustomLess(), valid_items, oob_default); } } for (unsigned int item = 0; item < ItemsPerThread; item++) { const unsigned int idx = thread_offset + item; if (idx >= valid_items) break; data[idx] = thread_data[item]; } } template < typename KeyType, typename ValueType, unsigned int ThreadsInBlock, unsigned int ItemsPerThread, bool Stable = false> __global__ void BlockMergeSortTestKernel(KeyType *keys, ValueType *values, unsigned int valid_items) { using BlockMergeSort = cub::BlockMergeSort; __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle; KeyType thread_keys[ItemsPerThread]; ValueType thread_values[ItemsPerThread]; const unsigned int thread_offset = threadIdx.x * ItemsPerThread; for (unsigned int item = 0; item < ItemsPerThread; item++) { const unsigned int idx = thread_offset + item; thread_keys[item] = idx < valid_items ? keys[idx] : KeyType(); thread_values[item] = idx < valid_items ? values[idx] : ValueType(); } __syncthreads(); // Tests below use sequence to fill the data. // Therefore the following value should be greater than any that // is present in the input data. const KeyType oob_default = ThreadsInBlock * ItemsPerThread + 1; if (Stable) { if (valid_items == ThreadsInBlock * ItemsPerThread) { BlockMergeSort(temp_storage_shuffle).StableSort( thread_keys, thread_values, CustomLess()); } else { BlockMergeSort(temp_storage_shuffle).StableSort( thread_keys, thread_values, CustomLess(), valid_items, oob_default); } } else { if (valid_items == ThreadsInBlock * ItemsPerThread) { BlockMergeSort(temp_storage_shuffle).Sort( thread_keys, thread_values, CustomLess()); } else { BlockMergeSort(temp_storage_shuffle).Sort( thread_keys, thread_values, CustomLess(), valid_items, oob_default); } } for (unsigned int item = 0; item < ItemsPerThread; item++) { const unsigned int idx = thread_offset + item; if (idx >= valid_items) break; keys[idx] = thread_keys[item]; values[idx] = thread_values[item]; } } template< typename DataType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock, bool Stable = false> void BlockMergeSortTest(DataType *data, unsigned int valid_items) { BlockMergeSortTestKernel <<<1, ThreadsInBlock>>>(data, valid_items); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); } template< typename KeyType, typename ValueType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock> void BlockMergeSortTest(KeyType *keys, ValueType *values, unsigned int valid_items) { BlockMergeSortTestKernel <<<1, ThreadsInBlock>>>(keys, values, valid_items); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); } template bool CheckResult(int num_items, thrust::device_vector &d_data, thrust::host_vector &h_data) { thrust::copy_n(d_data.begin(), num_items, h_data.begin()); for (int i = 0; i < num_items; i++) { if (h_data[i] != i) { return false; } } return true; } template < typename DataType, unsigned int 
ItemsPerThread, unsigned int ThreadsInBlock> void Test(unsigned int num_items, thrust::default_random_engine &rng, thrust::device_vector &d_data, thrust::host_vector &h_data) { thrust::sequence(d_data.begin(), d_data.end()); thrust::shuffle(d_data.begin(), d_data.end(), rng); BlockMergeSortTest( thrust::raw_pointer_cast(d_data.data()), num_items); AssertTrue(CheckResult(num_items, d_data, h_data)); } template < typename KeyType, typename ValueType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock> void Test(unsigned int num_items, thrust::default_random_engine &rng, thrust::device_vector &d_keys, thrust::device_vector &d_values, thrust::host_vector &h_data) { thrust::sequence(d_keys.begin(), d_keys.end()); thrust::shuffle(d_keys.begin(), d_keys.end(), rng); thrust::copy_n(d_keys.begin(), num_items, d_values.begin()); BlockMergeSortTest( thrust::raw_pointer_cast(d_keys.data()), thrust::raw_pointer_cast(d_values.data()), num_items); AssertTrue(CheckResult(num_items, d_values, h_data)); } template < typename KeyType, typename ValueType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock> void Test(thrust::default_random_engine &rng) { for (unsigned int num_items = ItemsPerThread * ThreadsInBlock; num_items > 1; num_items /= 2) { thrust::device_vector d_keys(num_items); thrust::device_vector d_values(num_items); thrust::host_vector h_keys(num_items); thrust::host_vector h_values(num_items); Test(num_items, rng, d_keys, h_keys); Test(num_items, rng, d_keys, d_values, h_values); } } template void Test(thrust::default_random_engine &rng) { Test(rng); Test(rng); // Mixed types Test(rng); Test(rng); } template void Test(thrust::default_random_engine &rng) { Test(rng); Test(rng); } struct CountToType { __device__ __host__ CustomType operator()(std::uint64_t val) { return { val }; } }; struct CountComparator { __device__ __host__ bool operator()(const CustomType &lhs, const CustomType &rhs) { if (lhs.key == rhs.key) return lhs.count < rhs.count; return lhs.key < rhs.key; } }; void TestStability() { constexpr unsigned int items_per_thread = 10; constexpr unsigned int threads_per_block = 128; constexpr unsigned int elements = items_per_thread * threads_per_block; constexpr bool stable = true; thrust::device_vector d_keys(elements); thrust::device_vector d_counts(elements); thrust::sequence(d_counts.begin(), d_counts.end()); thrust::transform(d_counts.begin(), d_counts.end(), d_keys.begin(), CountToType{}); // Sort keys BlockMergeSortTest( thrust::raw_pointer_cast(d_keys.data()), elements); // Check counts AssertTrue(thrust::is_sorted(d_keys.begin(), d_keys.end(), CountComparator{})); } int main(int argc, char** argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); thrust::default_random_engine rng; Test<1>(rng); Test<2>(rng); Test<10>(rng); Test<15>(rng); Test(rng); Test(rng); TestStability(); return 0; } cub-2.0.1/test/test_block_radix_sort.cu000066400000000000000000000554311434614775400202110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of BlockRadixSort utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; CachingDeviceAllocator g_allocator(true); //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /// Specialized descending, blocked -> blocked template __device__ __forceinline__ void TestBlockSort( typename BlockRadixSort::TempStorage &temp_storage, Key (&keys)[ITEMS_PER_THREAD], Value (&values)[ITEMS_PER_THREAD], Key *d_keys, Value *d_values, int begin_bit, int end_bit, clock_t &stop, Int2Type is_descending, Int2Type is_blocked_output) { BlockRadixSort(temp_storage).SortDescending(keys, values, begin_bit, end_bit); stop = clock(); StoreDirectBlocked(threadIdx.x, d_keys, keys); StoreDirectBlocked(threadIdx.x, d_values, values); } /// Specialized descending, blocked -> striped template __device__ __forceinline__ void TestBlockSort( typename BlockRadixSort::TempStorage &temp_storage, Key (&keys)[ITEMS_PER_THREAD], Value (&values)[ITEMS_PER_THREAD], Key *d_keys, Value *d_values, int begin_bit, int end_bit, clock_t &stop, Int2Type is_descending, Int2Type is_blocked_output) { BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit); stop = clock(); StoreDirectStriped(threadIdx.x, d_keys, keys); StoreDirectStriped(threadIdx.x, d_values, values); } /// Specialized ascending, blocked -> blocked template __device__ __forceinline__ void TestBlockSort( typename BlockRadixSort::TempStorage &temp_storage, Key (&keys)[ITEMS_PER_THREAD], Value (&values)[ITEMS_PER_THREAD], Key *d_keys, Value *d_values, int begin_bit, int end_bit, clock_t &stop, Int2Type is_descending, Int2Type is_blocked_output) { BlockRadixSort(temp_storage).Sort(keys, values, begin_bit, end_bit); stop = 
clock(); StoreDirectBlocked(threadIdx.x, d_keys, keys); StoreDirectBlocked(threadIdx.x, d_values, values); } /// Specialized ascending, blocked -> striped template __device__ __forceinline__ void TestBlockSort( typename BlockRadixSort::TempStorage &temp_storage, Key (&keys)[ITEMS_PER_THREAD], Value (&values)[ITEMS_PER_THREAD], Key *d_keys, Value *d_values, int begin_bit, int end_bit, clock_t &stop, Int2Type is_descending, Int2Type is_blocked_output) { BlockRadixSort(temp_storage).SortBlockedToStriped(keys, values, begin_bit, end_bit); stop = clock(); StoreDirectStriped(threadIdx.x, d_keys, keys); StoreDirectStriped(threadIdx.x, d_values, values); } /** * BlockRadixSort kernel */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, bool MEMOIZE_OUTER_SCAN, BlockScanAlgorithm INNER_SCAN_ALGORITHM, cudaSharedMemConfig SMEM_CONFIG, int DESCENDING, int BLOCKED_OUTPUT, typename Key, typename Value> __launch_bounds__ (BLOCK_THREADS, 1) __global__ void Kernel( Key *d_keys, Value *d_values, int begin_bit, int end_bit, clock_t *d_elapsed) { // Threadblock load/store abstraction types typedef BlockRadixSort< Key, BLOCK_THREADS, ITEMS_PER_THREAD, Value, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixSortT; // Allocate temp storage in shared memory __shared__ typename BlockRadixSortT::TempStorage temp_storage; // Items per thread Key keys[ITEMS_PER_THREAD]; Value values[ITEMS_PER_THREAD]; LoadDirectBlocked(threadIdx.x, d_keys, keys); LoadDirectBlocked(threadIdx.x, d_values, values); // Start cycle timer clock_t stop; clock_t start = clock(); TestBlockSort( temp_storage, keys, values, d_keys, d_values, begin_bit, end_bit, stop, Int2Type(), Int2Type()); // Store time if (threadIdx.x == 0) *d_elapsed = (start > stop) ? start - stop : stop - start; } //--------------------------------------------------------------------- // Host testing subroutines //--------------------------------------------------------------------- /** * Simple key-value pairing */ template < typename Key, typename Value> struct Pair { Key key; Value value; bool operator<(const Pair &b) const { return (key < b.key); } }; /** * Initialize key-value sorting problem. 
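 * Keys are generated according to gen_mode and the values are filled with
 * random bits. Key bits outside [begin_bit, end_bit) are masked off, and the
 * host reference is produced with a std::stable_sort over key-value pairs
 * (the pair array is reversed before and after the sort when DESCENDING is
 * set, so the reference matches a descending stable sort).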
*/ template void Initialize( GenMode gen_mode, Key *h_keys, Value *h_values, Key *h_reference_keys, Value *h_reference_values, int num_items, int entropy_reduction, int begin_bit, int end_bit) { (void)entropy_reduction; // unused Pair *h_pairs = new Pair[num_items]; for (int i = 0; i < num_items; ++i) { InitValue(gen_mode, h_keys[i], i); RandomBits(h_values[i]); // Mask off unwanted portions int num_bits = end_bit - begin_bit; if ((begin_bit > 0) || (end_bit < static_cast(sizeof(Key) * 8))) { unsigned long long base = 0; memcpy(&base, &h_keys[i], sizeof(Key)); base &= ((1ull << num_bits) - 1) << begin_bit; memcpy(&h_keys[i], &base, sizeof(Key)); } h_pairs[i].key = h_keys[i]; h_pairs[i].value = h_values[i]; } if (DESCENDING) std::reverse(h_pairs, h_pairs + num_items); std::stable_sort(h_pairs, h_pairs + num_items); if (DESCENDING) std::reverse(h_pairs, h_pairs + num_items); for (int i = 0; i < num_items; ++i) { h_reference_keys[i] = h_pairs[i].key; h_reference_values[i] = h_pairs[i].value; } delete[] h_pairs; } /** * Test BlockRadixSort kernel */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, bool MEMOIZE_OUTER_SCAN, BlockScanAlgorithm INNER_SCAN_ALGORITHM, cudaSharedMemConfig SMEM_CONFIG, bool DESCENDING, bool BLOCKED_OUTPUT, typename Key, typename Value> void TestDriver( GenMode gen_mode, int entropy_reduction, int begin_bit, int end_bit) { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD, KEYS_ONLY = std::is_same::value, }; // Allocate host arrays Key *h_keys = new Key[TILE_SIZE]; Key *h_reference_keys = new Key[TILE_SIZE]; Value *h_values = new Value[TILE_SIZE]; Value *h_reference_values = new Value[TILE_SIZE]; // Allocate device arrays Key *d_keys = NULL; Value *d_values = NULL; clock_t *d_elapsed = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys, sizeof(Key) * TILE_SIZE)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * TILE_SIZE)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t))); // Initialize problem and solution on host Initialize(gen_mode, h_keys, h_values, h_reference_keys, h_reference_values, TILE_SIZE, entropy_reduction, begin_bit, end_bit); // Copy problem to device CubDebugExit(cudaMemcpy(d_keys, h_keys, sizeof(Key) * TILE_SIZE, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * TILE_SIZE, cudaMemcpyHostToDevice)); printf("%s " "BLOCK_THREADS(%d) " "ITEMS_PER_THREAD(%d) " "RADIX_BITS(%d) " "MEMOIZE_OUTER_SCAN(%d) " "INNER_SCAN_ALGORITHM(%d) " "SMEM_CONFIG(%d) " "DESCENDING(%d) " "BLOCKED_OUTPUT(%d) " "sizeof(Key)(%d) " "sizeof(Value)(%d) " "gen_mode(%d), " "entropy_reduction(%d) " "begin_bit(%d) " "end_bit(%d), " "samples(%d)\n", ((KEYS_ONLY) ? "Keys-only" : "Key-value"), BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, (int) sizeof(Key), (int) sizeof(Value), gen_mode, entropy_reduction, begin_bit, end_bit, g_num_rand_samples); // Set shared memory config cudaDeviceSetSharedMemConfig(SMEM_CONFIG); // Run kernel Kernel<<<1, BLOCK_THREADS>>>( d_keys, d_values, begin_bit, end_bit, d_elapsed); // Flush kernel output / errors CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Check keys results printf("\tKeys: "); int compare = CompareDeviceResults(h_reference_keys, d_keys, TILE_SIZE, g_verbose, g_verbose); printf("%s\n", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Check value results if (!KEYS_ONLY) { printf("\tValues: "); compare = CompareDeviceResults(h_reference_values, d_values, TILE_SIZE, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); } printf("\n"); printf("\tElapsed clocks: "); DisplayDeviceResults(d_elapsed, 1); printf("\n"); // Cleanup if (h_keys) delete[] h_keys; if (h_reference_keys) delete[] h_reference_keys; if (h_values) delete[] h_values; if (h_reference_values) delete[] h_reference_values; if (d_keys) CubDebugExit(g_allocator.DeviceFree(d_keys)); if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values)); if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); } /** * Test driver (valid tile size <= MAX_SMEM_BYTES) */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, bool MEMOIZE_OUTER_SCAN, BlockScanAlgorithm INNER_SCAN_ALGORITHM, cudaSharedMemConfig SMEM_CONFIG, bool DESCENDING, bool BLOCKED_OUTPUT, typename Key, typename Value> void TestValid(Int2Type /*fits_smem_capacity*/) { // Iterate begin_bit for (int begin_bit = 0; begin_bit <= 1; begin_bit++) { // Iterate end bit for (int end_bit = begin_bit + 1; end_bit <= static_cast(sizeof(Key) * 8); end_bit = end_bit * 2 + begin_bit) { // Uniform key distribution TestDriver( UNIFORM, 0, begin_bit, end_bit); // Sequential key distribution TestDriver( INTEGER_SEED, 0, begin_bit, end_bit); // Iterate random with entropy_reduction for (int entropy_reduction = 0; entropy_reduction <= 9; entropy_reduction += 3) { TestDriver( RANDOM, entropy_reduction, begin_bit, end_bit); } // For floating-point keys, test random keys mixed with -0.0 and +0.0 if (cub::Traits::CATEGORY == cub::FLOATING_POINT) { TestDriver( RANDOM_MINUS_PLUS_ZERO, 0, begin_bit, end_bit); } } } } /** * Test driver (invalid tile size) */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, bool MEMOIZE_OUTER_SCAN, BlockScanAlgorithm INNER_SCAN_ALGORITHM, cudaSharedMemConfig SMEM_CONFIG, bool DESCENDING, bool BLOCKED_OUTPUT, typename Key, typename Value> void TestValid(Int2Type fits_smem_capacity) {} /** * Test ascending/descending and to-blocked/to-striped */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, bool MEMOIZE_OUTER_SCAN, BlockScanAlgorithm INNER_SCAN_ALGORITHM, cudaSharedMemConfig SMEM_CONFIG, typename Key, typename Value> void Test() { // Check size of smem storage for the target arch to make sure it will fit typedef BlockRadixSort BlockRadixSortT; Int2Type<(sizeof(typename BlockRadixSortT::TempStorage) <= 48 * 1024)> fits_smem_capacity; // Sort-descending, to-striped TestValid(fits_smem_capacity); // Sort-ascending, to-blocked TestValid(fits_smem_capacity); // Not necessary // TestValid(fits_smem_capacity); // TestValid(fits_smem_capacity); } /** * Test value type and smem config */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, bool MEMOIZE_OUTER_SCAN, BlockScanAlgorithm INNER_SCAN_ALGORITHM, typename Key> void TestKeys() { // Test keys-only sorting with both smem configs Test(); // Keys-only (4-byte smem bank config) Test(); // Keys-only (8-byte smem bank config) } /** * Test value type and smem config */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, bool MEMOIZE_OUTER_SCAN, BlockScanAlgorithm INNER_SCAN_ALGORITHM, typename Key> void TestKeysAndPairs() { // Test pairs sorting with only 4-byte configs Test(); // With small-values Test(); // With same-values Test(); // With large values } /** * Test key type */ template < 
int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, bool MEMOIZE_OUTER_SCAN, BlockScanAlgorithm INNER_SCAN_ALGORITHM> void Test() { // %PARAM% TEST_VALUE_TYPES types 0:1:2:3 #if TEST_VALUE_TYPES == 0 // Test unsigned types with keys-only TestKeys(); TestKeys(); #elif TEST_VALUE_TYPES == 1 TestKeys(); TestKeys(); #elif TEST_VALUE_TYPES == 2 // Test signed and fp types with paired values TestKeysAndPairs(); TestKeysAndPairs(); TestKeysAndPairs(); #elif TEST_VALUE_TYPES == 3 TestKeysAndPairs(); TestKeysAndPairs(); TestKeysAndPairs(); #endif } /** * Test inner scan algorithm */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, bool MEMOIZE_OUTER_SCAN> void Test() { Test(); Test(); } /** * Test outer scan algorithm */ template void Test() { Test(); Test(); } // Dispatch RADIX_BITS template void Test() { Test(); Test(); } // Dispatch ITEMS_PER_THREAD template void Test() { Test(); Test(); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); Test<32>(); Test<160>(); return 0; } cub-2.0.1/test/test_block_reduce.cu000066400000000000000000000572631434614775400173070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of BlockReduce utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; CachingDeviceAllocator g_allocator(true); //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /// Generic reduction (full, 1) template __device__ __forceinline__ T DeviceTest( BlockReduceT &block_reduce, T (&data)[1], ReductionOp &reduction_op) { return block_reduce.Reduce(data[0], reduction_op); } /// Generic reduction (full, ITEMS_PER_THREAD) template __device__ __forceinline__ T DeviceTest( BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], ReductionOp &reduction_op) { return block_reduce.Reduce(data, reduction_op); } /// Generic reduction (partial, 1) template __device__ __forceinline__ T DeviceTest( BlockReduceT &block_reduce, T &data, ReductionOp &reduction_op, int valid_threads) { return block_reduce.Reduce(data, reduction_op, valid_threads); } /// Sum reduction (full, 1) template __device__ __forceinline__ T DeviceTest( BlockReduceT &block_reduce, T (&data)[1], Sum &reduction_op) { return block_reduce.Sum(data[0]); } /// Sum reduction (full, ITEMS_PER_THREAD) template __device__ __forceinline__ T DeviceTest( BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], Sum &reduction_op) { return block_reduce.Sum(data); } /// Sum reduction (partial, 1) template __device__ __forceinline__ T DeviceTest( BlockReduceT &block_reduce, T &data, Sum &reduction_op, int valid_threads) { return block_reduce.Sum(data, valid_threads); } /** * Test full-tile reduction kernel (where num_items is an even * multiple of BLOCK_THREADS) */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, typename T, typename ReductionOp> __launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) __global__ void FullTileReduceKernel( T *d_in, T *d_out, ReductionOp reduction_op, int tiles, clock_t *d_elapsed) { const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Cooperative thread block reduction utility type (returns aggregate in thread 0) typedef BlockReduce BlockReduceT; // Allocate temp storage in shared memory __shared__ typename BlockReduceT::TempStorage temp_storage; int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); // Per-thread tile data T data[ITEMS_PER_THREAD]; // Load first tile of data int block_offset = 0; if (block_offset < TILE_SIZE * tiles) { LoadDirectBlocked(linear_tid, d_in + block_offset, data); block_offset += TILE_SIZE; // Start cycle timer clock_t start = clock(); // Cooperative reduce first tile BlockReduceT block_reduce(temp_storage) ; T block_aggregate = DeviceTest(block_reduce, data, reduction_op); // Stop cycle timer clock_t stop = clock(); clock_t elapsed = (start > stop) ? 
start - stop : stop - start; // Loop over input tiles while (block_offset < TILE_SIZE * tiles) { // TestBarrier between thread block reductions __syncthreads(); // Load tile of data LoadDirectBlocked(linear_tid, d_in + block_offset, data); block_offset += TILE_SIZE; // Start cycle timer clock_t start = clock(); // Cooperatively reduce the tile's aggregate BlockReduceT block_reduce(temp_storage) ; T tile_aggregate = DeviceTest(block_reduce, data, reduction_op); // Stop cycle timer clock_t stop = clock(); elapsed += (start > stop) ? start - stop : stop - start; // Reduce thread block aggregate block_aggregate = reduction_op(block_aggregate, tile_aggregate); } // Store data if (linear_tid == 0) { d_out[0] = block_aggregate; *d_elapsed = elapsed; } } } /** * Test partial-tile reduction kernel (where num_items < BLOCK_THREADS) */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, typename T, typename ReductionOp> __launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) __global__ void PartialTileReduceKernel( T *d_in, T *d_out, int num_items, ReductionOp reduction_op, clock_t *d_elapsed) { // Cooperative thread block reduction utility type (returns aggregate only in thread-0) typedef BlockReduce BlockReduceT; // Allocate temp storage in shared memory __shared__ typename BlockReduceT::TempStorage temp_storage; int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); // Per-thread tile data T partial; // Load partial tile data if (linear_tid < num_items) { partial = d_in[linear_tid]; } // Start cycle timer clock_t start = clock(); // Cooperatively reduce the tile's aggregate BlockReduceT block_reduce(temp_storage) ; T tile_aggregate = DeviceTest(block_reduce, partial, reduction_op, num_items); // Stop cycle timer clock_t stop = clock(); clock_t elapsed = (start > stop) ? start - stop : stop - start; // Store data if (linear_tid == 0) { d_out[0] = tile_aggregate; *d_elapsed = elapsed; } } //--------------------------------------------------------------------- // Host utility subroutines //--------------------------------------------------------------------- /** * Initialize problem (and solution) */ template < typename T, typename ReductionOp> void Initialize( GenMode gen_mode, T *h_in, T h_reference[1], ReductionOp reduction_op, int num_items) { for (int i = 0; i < num_items; ++i) { InitValue(gen_mode, h_in[i], i); if (i == 0) h_reference[0] = h_in[0]; else h_reference[0] = static_cast(reduction_op(h_reference[0], h_in[i])); } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n"); } } //--------------------------------------------------------------------- // Full tile test generation //--------------------------------------------------------------------- /** * Test full-tile reduction. 
(Specialized for sufficient resources) */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, typename T, typename ReductionOp> void TestFullTile( GenMode gen_mode, int tiles, ReductionOp reduction_op, Int2Type /*sufficient_resources*/) { const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; int num_items = TILE_SIZE * tiles; // Allocate host arrays T *h_in = new T[num_items]; T h_reference[1]; // Initialize problem Initialize(gen_mode, h_in, h_reference, reduction_op, num_items); // Initialize/clear device arrays T *d_in = NULL; T *d_out = NULL; clock_t *d_elapsed = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long))); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1)); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1)); // Test multi-tile (unguarded) printf("TestFullTile %s, %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), ITEMS_PER_THREAD(%d), tiles(%d), %s (%d bytes) elements:\n", std::is_same::value ? "Sum" : "Max", (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS", gen_mode, num_items, BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, tiles, typeid(T).name(), (int) sizeof(T)); fflush(stdout); dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); FullTileReduceKernel<<<1, block_dims>>>( d_in, d_out, reduction_op, tiles, d_elapsed); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Copy out and display results printf("\tReduction results: "); int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); printf("\tElapsed clocks: "); DisplayDeviceResults(d_elapsed, 1); // Cleanup if (h_in) delete[] h_in; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); } /** * Test full-tile reduction. (Specialized for insufficient resources) */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, typename T, typename ReductionOp> void TestFullTile( GenMode gen_mode, int tiles, ReductionOp reduction_op, Int2Type sufficient_resources) {} /** * Test full-tile reduction. 
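 *
 * This dispatch overload checks that the BlockReduce::TempStorage for the
 * requested configuration fits within 48 KiB of shared memory and that the
 * block uses at most 1024 threads; configurations exceeding either limit fall
 * through to the empty insufficient-resources specialization above.
 *
 * Illustrative call (parameter values chosen arbitrarily):
 *
 *   TestFullTile<BLOCK_REDUCE_RAKING, 128, 1, 1, 4, int>(UNIFORM, 1, Sum());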
*/ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, typename T, typename ReductionOp> void TestFullTile( GenMode gen_mode, int tiles, ReductionOp reduction_op) { // Check size of smem storage for the target arch to make sure it will fit typedef BlockReduce BlockReduceT; enum { sufficient_smem = (sizeof(typename BlockReduceT::TempStorage) <= 48 * 1024), sufficient_threads = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024), }; TestFullTile(gen_mode, tiles, reduction_op, Int2Type()); } /** * Run battery of tests for different thread block dimensions */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_THREADS, int ITEMS_PER_THREAD, typename T, typename ReductionOp> void TestFullTile( GenMode gen_mode, int tiles, ReductionOp reduction_op) { TestFullTile(gen_mode, tiles, reduction_op); TestFullTile(gen_mode, tiles, reduction_op); } /** * Run battery of tests for different thread items */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_THREADS, typename T, typename ReductionOp> void TestFullTile( GenMode gen_mode, int tiles, ReductionOp reduction_op) { TestFullTile(gen_mode, tiles, reduction_op); TestFullTile(gen_mode, tiles, reduction_op); } /** * Run battery of full-tile tests for different numbers of tiles */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_THREADS, typename T, typename ReductionOp> void TestFullTile( GenMode gen_mode, ReductionOp reduction_op) { for (int tiles = 1; tiles < 3; tiles++) { TestFullTile(gen_mode, tiles, reduction_op); } } //--------------------------------------------------------------------- // Partial-tile test generation //--------------------------------------------------------------------- /** * Test partial-tile reduction. (Specialized for sufficient resources) */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, typename T, typename ReductionOp> void TestPartialTile( GenMode gen_mode, int num_items, ReductionOp reduction_op, Int2Type /*sufficient_resources*/) { const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; const int TILE_SIZE = BLOCK_THREADS; // Allocate host arrays T *h_in = new T[num_items]; T h_reference[1]; // Initialize problem Initialize(gen_mode, h_in, h_reference, reduction_op, num_items); // Initialize/clear device arrays T *d_in = NULL; T *d_out = NULL; clock_t *d_elapsed = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long))); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1)); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1)); printf("TestPartialTile %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), %s (%d bytes) elements:\n", (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? 
"BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS", gen_mode, num_items, BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, typeid(T).name(), (int) sizeof(T)); fflush(stdout); dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); PartialTileReduceKernel<<<1, block_dims>>>( d_in, d_out, num_items, reduction_op, d_elapsed); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Copy out and display results printf("\tReduction results: "); int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); printf("\tElapsed clocks: "); DisplayDeviceResults(d_elapsed, 1); // Cleanup if (h_in) delete[] h_in; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); } /** * Test partial-tile reduction (specialized for insufficient resources) */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, typename T, typename ReductionOp> void TestPartialTile( GenMode gen_mode, int num_items, ReductionOp reduction_op, Int2Type sufficient_resources) {} /** * Run battery of partial-tile tests for different numbers of effective threads and thread dimensions */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, typename T, typename ReductionOp> void TestPartialTile( GenMode gen_mode, int num_items, ReductionOp reduction_op) { // Check size of smem storage for the target arch to make sure it will fit typedef BlockReduce BlockReduceT; enum { sufficient_smem = sizeof(typename BlockReduceT::TempStorage) <= 48 * 1024, sufficient_threads = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024, }; TestPartialTile(gen_mode, num_items, reduction_op, Int2Type()); } /** * Run battery of partial-tile tests for different numbers of effective threads and thread dimensions */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_THREADS, typename T, typename ReductionOp> void TestPartialTile( GenMode gen_mode, ReductionOp reduction_op) { for ( int num_items = 1; num_items < BLOCK_THREADS; num_items += CUB_MAX(1, BLOCK_THREADS / 5)) { TestPartialTile(gen_mode, num_items, reduction_op); TestPartialTile(gen_mode, num_items, reduction_op); } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Run battery of full-tile tests for different gen modes */ template < BlockReduceAlgorithm ALGORITHM, int BLOCK_THREADS, typename T, typename ReductionOp> void Test( ReductionOp reduction_op) { TestFullTile(UNIFORM, reduction_op); TestPartialTile(UNIFORM, reduction_op); TestFullTile(INTEGER_SEED, reduction_op); TestPartialTile(INTEGER_SEED, reduction_op); if (Traits::CATEGORY != FLOATING_POINT) { // Don't test randomly-generated floats b/c of stability TestFullTile(RANDOM, reduction_op); TestPartialTile(RANDOM, reduction_op); } } /** * Run battery of tests for different block-reduction algorithmic variants */ template < int BLOCK_THREADS, typename T, typename ReductionOp> void Test( ReductionOp reduction_op) { Test(reduction_op); Test(reduction_op); Test(reduction_op); } /** * Run battery of tests for different block sizes */ template < typename T, typename ReductionOp> void Test( ReductionOp reduction_op) { Test<7, T>(reduction_op); Test<32, T>(reduction_op); Test<65, T>(reduction_op); Test<128, 
T>(reduction_op); } /** * Run battery of tests for different block sizes */ template void Test() { Test(Sum()); Test(Max()); } template void Test() { Test(Sum()); Test(Max()); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // %PARAM% TEST_VALUE_TYPES types 0:1:2:3 // primitives #if TEST_VALUE_TYPES == 0 Test(); Test(); Test(); Test(); #elif TEST_VALUE_TYPES == 1 Test(); Test(); // vector types Test(); Test(); #elif TEST_VALUE_TYPES == 2 Test(); Test(); Test(); Test(); #elif TEST_VALUE_TYPES == 3 Test(); Test(); // Complex types Test(); Test(); #endif return 0; } cub-2.0.1/test/test_block_run_length_decode.cu000066400000000000000000000706631434614775400215070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; /****************************************************************************** * HELPER CLASS FOR RUN-LENGTH DECODING TESTS ******************************************************************************/ /** * \brief Class template to facilitate testing the BlockRunLengthDecode algorithm for all its template parameter * specialisations. 
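 *
 * The agent is exercised in two passes by the kernels further below: each
 * block first computes the run-length decoded size of its tile of runs, and
 * then, after a device-wide prefix sum over those per-block sizes, decodes
 * its tile into the output buffer at the resulting block offset. A minimal
 * sketch of the per-block usage (identifiers here are hypothetical):
 *
 *   __shared__ typename AgentT::TempStorage temp_storage;
 *   AgentT agent(temp_storage);
 *
 *   // Pass 1: size of this block's decoded tile
 *   uint32_t decoded_size = agent.GetDecodedSize(d_items + block_offset,
 *                                                d_lengths + block_offset,
 *                                                num_valid_runs);
 *
 *   // Pass 2 (separate kernel, after scanning the per-block sizes):
 *   agent.WriteDecodedRuns(d_items + block_offset,
 *                          d_lengths + block_offset,
 *                          d_decoded + d_offsets[blockIdx.x],
 *                          d_rel_offsets + d_offsets[blockIdx.x],
 *                          num_valid_runs);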
* * \tparam ItemItT The item type being run-length decoded * \tparam RunLengthsItT Iterator type providing the runs' lengths * \tparam RUNS_PER_THREAD The number of runs that each thread is getting assigned to * \tparam DECODED_ITEMS_PER_THREAD The number of run-length decoded items that each thread is decoding * \tparam TEST_RELATIVE_OFFSETS_ Whether to also retrieve each decoded item's relative offset within its run * \tparam TEST_RUN_OFFSETS_ Whether to pass in each run's offset instead of each run's length * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension */ template class AgentTestBlockRunLengthDecode { public: constexpr static uint32_t BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; constexpr static uint32_t RUNS_PER_BLOCK = RUNS_PER_THREAD * BLOCK_THREADS; constexpr static bool TEST_RELATIVE_OFFSETS = TEST_RELATIVE_OFFSETS_; private: using RunItemT = cub::detail::value_t; using RunLengthT = cub::detail::value_t; using BlockRunOffsetScanT = cub::BlockScan; using BlockRunLengthDecodeT = cub::BlockRunLengthDecode; using BlockLoadRunItemT = cub::BlockLoad; using BlockLoadRunLengthsT = cub::BlockLoad; using BlockStoreDecodedItemT = cub::BlockStore; using BlockStoreRelativeOffsetT = cub::BlockStore; __device__ __forceinline__ BlockRunLengthDecodeT InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD], RunLengthT (&run_lengths)[RUNS_PER_THREAD], RunLengthT &decoded_size, cub::Int2Type /*test_run_offsets*/) { RunLengthT run_offsets[RUNS_PER_THREAD]; BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage).ExclusiveSum(run_lengths, run_offsets, decoded_size); // Ensure temporary shared memory can be repurposed CTA_SYNC(); // Construct BlockRunLengthDecode and initialize with the run offsets return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_offsets); } __device__ __forceinline__ BlockRunLengthDecodeT InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD], RunLengthT (&run_lengths)[RUNS_PER_THREAD], RunLengthT &decoded_size, cub::Int2Type /*test_run_offsets*/) { // Construct BlockRunLengthDecode and initialize with the run lengths return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_lengths, decoded_size); } __device__ __forceinline__ void LoadRuns(ItemItT d_block_unique_items, RunLengthsItT d_block_run_lengths, RunItemT (&unique_items)[RUNS_PER_THREAD], RunLengthT (&run_lengths)[RUNS_PER_THREAD], size_t num_valid_items) { if (num_valid_items < RUNS_PER_BLOCK) { BlockLoadRunItemT(temp_storage.load_uniques_storage).Load(d_block_unique_items, unique_items, num_valid_items); } else { BlockLoadRunItemT(temp_storage.load_uniques_storage).Load(d_block_unique_items, unique_items); } // Ensure BlockLoad's temporary shared memory can be repurposed CTA_SYNC(); // Load this block's tile of run lengths if (num_valid_items < RUNS_PER_BLOCK) BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage) .Load(d_block_run_lengths, run_lengths, num_valid_items, static_cast(0)); else BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage).Load(d_block_run_lengths, run_lengths); // Ensure temporary shared memory can be repurposed CTA_SYNC(); } public: union TempStorage { typename BlockLoadRunItemT::TempStorage load_uniques_storage; typename BlockLoadRunLengthsT::TempStorage load_run_lengths_storage; 
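    // The members of this union back distinct phases of the agent (loading the
    // runs, scanning run lengths into offsets, then decoding and storing), which
    // are separated by CTA_SYNC() so the same shared memory can be repurposed.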
cub::detail::conditional_t run_offsets_scan_storage; struct { typename BlockRunLengthDecodeT::TempStorage run_length_decode_storage; typename BlockStoreDecodedItemT::TempStorage store_decoded_runs_storage; typename BlockStoreRelativeOffsetT::TempStorage store_relative_offsets; } decode; }; TempStorage &temp_storage; __device__ __forceinline__ AgentTestBlockRunLengthDecode(TempStorage &temp_storage) : temp_storage(temp_storage) {} /** * \brief Loads the given block (or tile) of runs, and computes their "decompressed" (run-length decoded) size. */ __device__ __forceinline__ uint32_t GetDecodedSize(ItemItT d_block_unique_items, RunLengthsItT d_block_run_lengths, size_t num_valid_runs) { // Load this block's tile of encoded runs RunItemT unique_items[RUNS_PER_THREAD]; RunLengthT run_lengths[RUNS_PER_THREAD]; LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs); // Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the "decompressed" size) uint32_t decoded_size = 0U; BlockRunLengthDecodeT run_length_decode = InitBlockRunLengthDecode(unique_items, run_lengths, decoded_size, cub::Int2Type()); return decoded_size; } /** * \brief Loads the given block (or tile) of runs, run-length decodes them, and writes the results to \p * d_block_decoded_out. */ template __device__ __forceinline__ uint32_t WriteDecodedRuns(ItemItT d_block_unique_items, RunLengthsItT d_block_run_lengths, UniqueItemOutItT d_block_decoded_out, RelativeOffsetOutItT d_block_rel_out, size_t num_valid_runs) { // Load this block's tile of encoded runs RunItemT unique_items[RUNS_PER_THREAD]; RunLengthT run_lengths[RUNS_PER_THREAD]; LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs); // Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the "decompressed" size) uint32_t decoded_size = 0U; BlockRunLengthDecodeT run_length_decode = InitBlockRunLengthDecode(unique_items, run_lengths, decoded_size, cub::Int2Type()); // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs // have been decoded. 
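    // Each iteration of the loop below decodes a window of at most
    // BLOCK_THREADS * DECODED_ITEMS_PER_THREAD items starting at
    // decoded_window_offset; the remaining-item count guards the partial
    // final window when the decoded items (and, optionally, their relative
    // offsets) are stored to global memory.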
uint32_t decoded_window_offset = 0U; while (decoded_window_offset < decoded_size) { RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD]; RunItemT decoded_items[DECODED_ITEMS_PER_THREAD]; // The number of decoded items that are valid within this window (aka pass) of run-length decoding uint32_t num_valid_items = decoded_size - decoded_window_offset; run_length_decode.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset); BlockStoreDecodedItemT(temp_storage.decode.store_decoded_runs_storage) .Store(d_block_decoded_out + decoded_window_offset, decoded_items, num_valid_items); if (TEST_RELATIVE_OFFSETS) { BlockStoreRelativeOffsetT(temp_storage.decode.store_relative_offsets) .Store(d_block_rel_out + decoded_window_offset, relative_offsets, num_valid_items); } decoded_window_offset += DECODED_ITEMS_PER_THREAD * BLOCK_THREADS; } return decoded_size; } }; /****************************************************************************** * [STAGE 1] RUN-LENGTH DECODING TEST KERNEL ******************************************************************************/ template __launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__ void BlockRunLengthDecodeGetSizeKernel(const ItemItT d_unique_items, const RunLengthsItT d_run_lengths, const OffsetT num_runs, DecodedSizesOutT d_decoded_sizes) { constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK; __shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage; OffsetT block_offset = blockIdx.x * RUNS_PER_BLOCK; OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset) : RUNS_PER_BLOCK; AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage); uint64_t num_decoded_items = run_length_decode_agent.GetDecodedSize(d_unique_items + block_offset, d_run_lengths + block_offset, num_valid_runs); d_decoded_sizes[blockIdx.x] = num_decoded_items; } /****************************************************************************** * [STAGE 2] RUN-LENGTH DECODING TEST KERNEL ******************************************************************************/ template __launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__ void BlockRunLengthDecodeTestKernel(const ItemItT d_unique_items, const RunLengthsItT d_run_lengths, const DecodedSizesOutT d_decoded_offsets, const OffsetT num_runs, DecodedItemsOutItT d_decoded_items, RelativeOffsetOutItT d_relative_offsets) { constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK; __shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage; OffsetT block_offset = blockIdx.x * RUNS_PER_BLOCK; OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? 
(num_runs - block_offset) : RUNS_PER_BLOCK; AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage); run_length_decode_agent.WriteDecodedRuns(d_unique_items + block_offset, d_run_lengths + block_offset, d_decoded_items + d_decoded_offsets[blockIdx.x], d_relative_offsets + d_decoded_offsets[blockIdx.x], num_valid_runs); } struct ModOp { using T = uint32_t; __host__ __device__ __forceinline__ T operator()(const T &x) const { return 1 + (x % 100); } }; template void TestAlgorithmSpecialisation() { constexpr uint32_t THREADS_PER_BLOCK = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; constexpr uint32_t RUNS_PER_BLOCK = RUNS_PER_THREAD * THREADS_PER_BLOCK; using RunItemT = float; using RunLengthT = uint32_t; using ItemItT = cub::CountingInputIterator; using RunLengthsItT = cub::TransformInputIterator>; ItemItT d_unique_items(1000U); RunLengthsItT d_run_lengths(cub::CountingInputIterator(0), ModOp{}); constexpr uint32_t num_runs = 10000; constexpr uint32_t num_blocks = (num_runs + (RUNS_PER_BLOCK - 1U)) / RUNS_PER_BLOCK; size_t temp_storage_bytes = 0ULL; void *temp_storage = nullptr; uint32_t *h_num_decoded_total = nullptr; uint32_t *d_decoded_sizes = nullptr; uint32_t *d_decoded_offsets = nullptr; RunItemT *d_decoded_out = nullptr; RunLengthT *d_relative_offsets = nullptr; RunItemT *h_decoded_out = nullptr; RunLengthT *h_relative_offsets = nullptr; using AgentTestBlockRunLengthDecodeT = AgentTestBlockRunLengthDecode; enum : uint32_t { TIMER_SIZE_BEGIN = 0, TIMER_SIZE_END, TIMER_DECODE_BEGIN, TIMER_DECODE_END, NUM_TIMERS, }; cudaStream_t stream; cudaStreamCreate(&stream); cudaEvent_t cuda_evt_timers[NUM_TIMERS]; for (uint32_t i = 0; i < NUM_TIMERS; i++) { cudaEventCreate(&cuda_evt_timers[i]); } // Get temporary storage requirements for the scan (for computing offsets for the per-block run-length decoded items) cub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, d_decoded_sizes, d_decoded_offsets, num_blocks, stream); // Allocate device memory CubDebugExit(cudaMalloc(&temp_storage, temp_storage_bytes)); CubDebugExit(cudaMalloc(&d_decoded_sizes, num_blocks * sizeof(*d_decoded_sizes))); // Allocate for the exclusive sum PLUS the overall aggregate CubDebugExit(cudaMalloc(&d_decoded_offsets, (num_blocks + 1) * sizeof(*d_decoded_offsets))); CubDebugExit(cudaMallocHost(&h_num_decoded_total, sizeof(*h_num_decoded_total))); // Get the per-block number of items being decoded (i-th thread block writing size to d_decoded_sizes[i]) CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_SIZE_BEGIN], stream)); BlockRunLengthDecodeGetSizeKernel <<>>(d_unique_items, d_run_lengths, num_runs, d_decoded_sizes); CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_SIZE_END], stream)); // Compute offsets for the runs decoded by each block (exclusive sum + aggregate) CubDebugExit(cudaMemsetAsync(d_decoded_offsets, 0, sizeof(d_decoded_offsets[0]), stream)); CubDebugExit(cub::DeviceScan::InclusiveSum(temp_storage, temp_storage_bytes, d_decoded_sizes, &d_decoded_offsets[1], num_blocks, stream)); // Copy the total decoded size to CPU in order to allocate just the right amount of device memory CubDebugExit(cudaMemcpyAsync(h_num_decoded_total, &d_decoded_offsets[num_blocks], sizeof(*h_num_decoded_total), cudaMemcpyDeviceToHost, stream)); // Ensure the total decoded size has been copied from GPU to CPU CubDebugExit(cudaStreamSynchronize(stream)); // Allocate device memory for the run-length decoded output CubDebugExit(cudaMallocHost(&h_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT))); 
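  // The matching device output buffer is allocated next; both the pinned host
  // staging buffer above and the device buffer are sized from the total decoded
  // item count that was copied back from the GPU.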
CubDebugExit(cudaMalloc(&d_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT))); if (TEST_RELATIVE_OFFSETS) { CubDebugExit(cudaMalloc(&d_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT))); CubDebugExit(cudaMallocHost(&h_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT))); } // Perform the block-wise run-length decoding (each block taking its offset from d_decoded_offsets) CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_DECODE_BEGIN], stream)); BlockRunLengthDecodeTestKernel <<>>(d_unique_items, d_run_lengths, d_decoded_offsets, num_runs, d_decoded_out, d_relative_offsets); CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_DECODE_END], stream)); // Copy back results for verification CubDebugExit(cudaMemcpyAsync(h_decoded_out, d_decoded_out, (*h_num_decoded_total) * sizeof(*h_decoded_out), cudaMemcpyDeviceToHost, stream)); if (TEST_RELATIVE_OFFSETS) { // Copy back the relative offsets CubDebugExit(cudaMemcpyAsync(h_relative_offsets, d_relative_offsets, (*h_num_decoded_total) * sizeof(*h_relative_offsets), cudaMemcpyDeviceToHost, stream)); } // Generate host-side run-length decoded data for verification std::vector> host_golden; host_golden.reserve(*h_num_decoded_total); for (uint32_t run = 0; run < num_runs; run++) { for (RunLengthT i = 0; i < d_run_lengths[run]; i++) { host_golden.push_back({d_unique_items[run], i}); } } // Ensure the run-length decoded result has been copied to the host CubDebugExit(cudaStreamSynchronize(stream)); // Verify the total run-length decoded size is correct AssertEquals(host_golden.size(), h_num_decoded_total[0]); float duration_size = 0.0f; float duration_decode = 0.0f; cudaEventElapsedTime(&duration_size, cuda_evt_timers[TIMER_SIZE_BEGIN], cuda_evt_timers[TIMER_SIZE_END]); cudaEventElapsedTime(&duration_decode, cuda_evt_timers[TIMER_DECODE_BEGIN], cuda_evt_timers[TIMER_DECODE_END]); size_t decoded_bytes = host_golden.size() * sizeof(RunItemT); size_t relative_offsets_bytes = TEST_RELATIVE_OFFSETS ? host_golden.size() * sizeof(RunLengthT) : 0ULL; size_t total_bytes_written = decoded_bytes + relative_offsets_bytes; std::cout << "MODE: " << (TEST_RELATIVE_OFFSETS ? "offsets, " : "normal, ") // << "INIT: " << (TEST_RUN_OFFSETS ? 
"run offsets, " : "run lengths, ") // << "RUNS_PER_THREAD: " << RUNS_PER_THREAD // << ", DECODED_ITEMS_PER_THREAD: " << DECODED_ITEMS_PER_THREAD // << ", THREADS_PER_BLOCK: " << THREADS_PER_BLOCK // << ", decoded size (bytes): " << decoded_bytes // << ", relative offsets (bytes): " << relative_offsets_bytes // << ", time_size (ms): " << duration_size // << ", time_decode (ms): " << duration_decode // << ", achieved decode BW (GB/s): " << ((static_cast(total_bytes_written) / 1.0e9) * (1000.0 / duration_decode)) << "\n"; // Verify the run-length decoded data is correct bool cmp_eq = true; for (uint32_t i = 0; i < host_golden.size(); i++) { if (host_golden[i].first != h_decoded_out[i]) { std::cout << "Mismatch at #" << i << ": CPU item: " << host_golden[i].first << ", GPU: " << h_decoded_out[i] << "\n"; cmp_eq = false; } if (TEST_RELATIVE_OFFSETS) { if (host_golden[i].second != h_relative_offsets[i]) { std::cout << "Mismatch of relative offset at #" << i << ": CPU item: " << host_golden[i].first << ", GPU: " << h_decoded_out[i] << "; relative offsets: CPU: " << host_golden[i].second << ", GPU: " << h_relative_offsets[i] << "\n"; cmp_eq = false; break; } } } AssertEquals(cmp_eq, true); // Clean up memory allocations CubDebugExit(cudaFree(temp_storage)); CubDebugExit(cudaFree(d_decoded_sizes)); CubDebugExit(cudaFree(d_decoded_offsets)); CubDebugExit(cudaFree(d_decoded_out)); CubDebugExit(cudaFreeHost(h_num_decoded_total)); CubDebugExit(cudaFreeHost(h_decoded_out)); if (TEST_RELATIVE_OFFSETS) { CubDebugExit(cudaFree(d_relative_offsets)); CubDebugExit(cudaFreeHost(h_relative_offsets)); } // Clean up events for (uint32_t i = 0; i < NUM_TIMERS; i++) { CubDebugExit(cudaEventDestroy(cuda_evt_timers[i])); } // Clean up streams CubDebugExit(cudaStreamDestroy(stream)); } template void TestForTuningParameters() { constexpr bool DO_TEST_RELATIVE_OFFSETS = true; constexpr bool DO_NOT_TEST_RELATIVE_OFFSETS = false; constexpr bool TEST_WITH_RUN_OFFSETS = true; constexpr bool TEST_WITH_RUN_LENGTHS = false; // Run BlockRunLengthDecode that uses run lengths and generates offsets relative to each run TestAlgorithmSpecialisation(); // Run BlockRunLengthDecode that uses run lengths and performs normal run-length decoding TestAlgorithmSpecialisation(); // Run BlockRunLengthDecode that uses run offsets and generates offsets relative to each run TestAlgorithmSpecialisation(); // Run BlockRunLengthDecode that uses run offsets and performs normal run-length decoding TestAlgorithmSpecialisation(); } int main(int argc, char **argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); // Instantiate test template instances for various configurations (tuning parameter dimensions) // TestForTuningParameters<1U, 1U, 64U>(); TestForTuningParameters<1U, 3U, 32U, 2U, 3U>(); TestForTuningParameters<1U, 1U, 128U>(); TestForTuningParameters<1U, 8U, 128U>(); TestForTuningParameters<2U, 8U, 128U>(); TestForTuningParameters<3U, 1U, 256U>(); TestForTuningParameters<1U, 8U, 256U>(); TestForTuningParameters<8U, 1U, 256U>(); TestForTuningParameters<1U, 1U, 256U>(); TestForTuningParameters<2U, 2U, 384U>(); return 0; } cub-2.0.1/test/test_block_scan.cu000066400000000000000000001002501434614775400167450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of BlockScan utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; CachingDeviceAllocator g_allocator(true); /** * Primitive variant to test */ enum TestMode { BASIC, AGGREGATE, PREFIX, }; /** * Scan mode to test */ enum ScanMode { EXCLUSIVE, INCLUSIVE }; /** * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants) */ template struct WrapperFunctor { OpT op; WrapperFunctor(OpT op) : op(op) {} template __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const { return static_cast(op(a, b)); } }; /** * Stateful prefix functor */ template < typename T, typename ScanOpT> struct BlockPrefixCallbackOp { int linear_tid; T prefix; ScanOpT scan_op; __device__ __forceinline__ BlockPrefixCallbackOp(int linear_tid, T prefix, ScanOpT scan_op) : linear_tid(linear_tid), prefix(prefix), scan_op(scan_op) {} __device__ __forceinline__ T operator()(T block_aggregate) { // For testing purposes T retval = (linear_tid == 0) ? 
prefix : T(); prefix = scan_op(prefix, block_aggregate); return retval; } }; //--------------------------------------------------------------------- // Exclusive scan //--------------------------------------------------------------------- /// Exclusive scan (BASIC, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op); } /// Exclusive scan (BASIC, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data, data, initial_value, scan_op); } /// Exclusive scan (AGGREGATE, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op, block_aggregate); } /// Exclusive scan (AGGREGATE, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data, data, initial_value, scan_op, block_aggregate); } /// Exclusive scan (PREFIX, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data[0], data[0], scan_op, prefix_op); } /// Exclusive scan (PREFIX, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data, data, scan_op, prefix_op); } //--------------------------------------------------------------------- // Exclusive sum //--------------------------------------------------------------------- /// Exclusive sum (BASIC, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data[0], data[0]); } /// Exclusive sum (BASIC, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data, data); } /// Exclusive sum (AGGREGATE, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data[0], data[0], block_aggregate); 
} /// Exclusive sum (AGGREGATE, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data, data, block_aggregate); } /// Exclusive sum (PREFIX, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data[0], data[0], prefix_op); } /// Exclusive sum (PREFIX, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data, data, prefix_op); } //--------------------------------------------------------------------- // Inclusive scan //--------------------------------------------------------------------- /// Inclusive scan (BASIC, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data[0], data[0], scan_op); } /// Inclusive scan (BASIC, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data, data, scan_op); } /// Inclusive scan (AGGREGATE, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data[0], data[0], scan_op, block_aggregate); } /// Inclusive scan (AGGREGATE, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data, data, scan_op, block_aggregate); } /// Inclusive scan (PREFIX, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data[0], data[0], scan_op, prefix_op); } /// Inclusive scan (PREFIX, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data, data, scan_op, prefix_op); } //--------------------------------------------------------------------- // Inclusive sum //--------------------------------------------------------------------- /// Inclusive sum (BASIC, 1) template __device__ __forceinline__ void DeviceTest( 
BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data[0], data[0]); } /// Inclusive sum (BASIC, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data, data); } /// Inclusive sum (AGGREGATE, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data[0], data[0], block_aggregate); } /// Inclusive sum (AGGREGATE, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data, data, block_aggregate); } /// Inclusive sum (PREFIX, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data[0], data[0], prefix_op); } /// Inclusive sum (PREFIX, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data, data, prefix_op); } //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /** * BlockScan test kernel. 
*/ template < int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename T, typename ScanOpT> __launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) __global__ void BlockScanKernel( T *d_in, T *d_out, T *d_aggregate, ScanOpT scan_op, T initial_value, clock_t *d_elapsed) { const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Parameterize BlockScan type for our thread block typedef BlockScan BlockScanT; // Allocate temp storage in shared memory __shared__ typename BlockScanT::TempStorage temp_storage; int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); // Per-thread tile data T data[ITEMS_PER_THREAD]; LoadDirectBlocked(linear_tid, d_in, data); __threadfence_block(); // workaround to prevent clock hoisting clock_t start = clock(); __threadfence_block(); // workaround to prevent clock hoisting // Test scan T block_aggregate; BlockScanT block_scan(temp_storage); BlockPrefixCallbackOp prefix_op(linear_tid, initial_value, scan_op); DeviceTest(block_scan, data, initial_value, scan_op, block_aggregate, prefix_op, Int2Type(), Int2Type(), Int2Type::PRIMITIVE>()); // Stop cycle timer __threadfence_block(); // workaround to prevent clock hoisting clock_t stop = clock(); __threadfence_block(); // workaround to prevent clock hoisting // Store output StoreDirectBlocked(linear_tid, d_out, data); // Store block_aggregate if (TEST_MODE != BASIC) d_aggregate[linear_tid] = block_aggregate; // Store prefix if (TEST_MODE == PREFIX) { if (linear_tid == 0) d_out[TILE_SIZE] = prefix_op.prefix; } // Store time if (linear_tid == 0) *d_elapsed = (start > stop) ? start - stop : stop - start; } //--------------------------------------------------------------------- // Host utility subroutines //--------------------------------------------------------------------- /** * Initialize exclusive-scan problem (and solution) */ template T Initialize( GenMode gen_mode, T *h_in, T *h_reference, int num_items, ScanOpT scan_op, T initial_value, Int2Type) { InitValue(gen_mode, h_in[0], 0); T block_aggregate = h_in[0]; h_reference[0] = initial_value; T inclusive = static_cast(scan_op(initial_value, h_in[0])); for (int i = 1; i < num_items; ++i) { InitValue(gen_mode, h_in[i], i); h_reference[i] = inclusive; inclusive = static_cast(scan_op(inclusive, h_in[i])); block_aggregate = static_cast(scan_op(block_aggregate, h_in[i])); } return block_aggregate; } /** * Initialize inclusive-scan problem (and solution) */ template T Initialize( GenMode gen_mode, T *h_in, T *h_reference, int num_items, ScanOpT scan_op, T initial_value, Int2Type) { InitValue(gen_mode, h_in[0], 0); T block_aggregate = h_in[0]; T inclusive = static_cast(scan_op(initial_value, h_in[0])); h_reference[0] = inclusive; for (int i = 1; i < num_items; ++i) { InitValue(gen_mode, h_in[i], i); inclusive = static_cast(scan_op(inclusive, h_in[i])); block_aggregate = static_cast(scan_op(block_aggregate, h_in[i])); h_reference[i] = inclusive; } return block_aggregate; } /** * Test thread block scan. 
(Specialized for sufficient resources) */ template < int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value, Int2Type /*sufficient_resources*/) { const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays T *h_in = new T[TILE_SIZE]; T *h_reference = new T[TILE_SIZE]; T *h_aggregate = new T[BLOCK_THREADS]; // Initialize problem T block_aggregate = Initialize( gen_mode, h_in, h_reference, TILE_SIZE, scan_op, initial_value, Int2Type()); // Test reference block_aggregate is returned in all threads for (int i = 0; i < BLOCK_THREADS; ++i) { h_aggregate[i] = block_aggregate; } // Run kernel printf("Test-mode %d, gen-mode %d, policy %d, %s %s BlockScan, %d (%d,%d,%d) thread block threads, %d items per thread, %d tile size, %s (%d bytes) elements:\n", TEST_MODE, gen_mode, ALGORITHM, (SCAN_MODE == INCLUSIVE) ? "Inclusive" : "Exclusive", typeid(ScanOpT).name(), BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, TILE_SIZE, typeid(T).name(), (int) sizeof(T)); fflush(stdout); // Initialize/clear device arrays T *d_in = NULL; T *d_out = NULL; T *d_aggregate = NULL; clock_t *d_elapsed = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long))); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (TILE_SIZE + 2))); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * BLOCK_THREADS)); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * TILE_SIZE, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (TILE_SIZE + 1))); CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * BLOCK_THREADS)); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) { std::cout << CoutCast(h_in[i]) << ", "; } printf("\n\n"); } // Run block_aggregate/prefix kernel dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); BlockScanKernel<<<1, block_dims>>>( d_in, d_out, d_aggregate, scan_op, initial_value, d_elapsed); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Copy out and display results printf("\tScan results: "); int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); if (TEST_MODE == AGGREGATE) { // Copy out and display block_aggregate printf("\tScan block aggregate: "); compare = CompareDeviceResults(h_aggregate, d_aggregate, BLOCK_THREADS, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); } if (TEST_MODE == PREFIX) { // Copy out and display updated prefix printf("\tScan running total: "); T running_total = static_cast(scan_op(initial_value, block_aggregate)); compare = CompareDeviceResults(&running_total, d_out + TILE_SIZE, 1, g_verbose, g_verbose); printf("%s\n", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); } printf("\tElapsed clocks: "); DisplayDeviceResults(d_elapsed, 1); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (h_aggregate) delete[] h_aggregate; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate)); if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); } /** * Test thread block scan. (Specialized for insufficient resources) */ template < int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename ScanOpT, typename T> void Test( GenMode /*gen_mode*/, ScanOpT /*scan_op*/, T /*initial_value*/, Int2Type /*sufficient_resources*/) {} /** * Test thread block scan. */ template < int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value) { // Check size of smem storage for the target arch to make sure it will fit typedef BlockScan BlockScanT; enum { sufficient_smem = (sizeof(typename BlockScanT::TempStorage) <= 16 * 1024), sufficient_threads = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024), sufficient_resources = (sufficient_smem && sufficient_threads), }; Test( gen_mode, scan_op, initial_value, Int2Type()); } /** * Run test for different thread block dimensions */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value) { Test(gen_mode, scan_op, initial_value); Test(gen_mode, scan_op, initial_value); } /** * Run test for different policy types */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value) { Test(gen_mode, scan_op, initial_value); Test(gen_mode, scan_op, initial_value); Test(gen_mode, scan_op, initial_value); } /** * Run tests for different primitive variants */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T identity, T initial_value) { // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values) Test(gen_mode, scan_op, identity); Test(gen_mode, scan_op, identity); Test(gen_mode, scan_op, identity); // Exclusive (non-specialized, so we can use initial-value) Test(gen_mode, WrapperFunctor(scan_op), initial_value); Test(gen_mode, WrapperFunctor(scan_op), initial_value); Test(gen_mode, WrapperFunctor(scan_op), initial_value); // Inclusive Test(gen_mode, scan_op, identity); // This scan doesn't take an initial value Test(gen_mode, scan_op, identity); // This scan doesn't take an initial value Test(gen_mode, scan_op, initial_value); } /** * Run tests for different problem-generation options */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, typename ScanOpT, typename T> void Test( ScanOpT scan_op, T identity, T initial_value) { Test(UNIFORM, scan_op, identity, initial_value); Test(INTEGER_SEED, scan_op, identity, initial_value); // Don't test randomly-generated floats b/c of stability if (Traits::CATEGORY != FLOATING_POINT) Test(RANDOM, scan_op, identity, initial_value); } // Dispatch 
ITEMS_PER_THREAD template void Test(ScanOpT op, T identity, T initial_value) { Test(op, identity, initial_value); Test(op, identity, initial_value); } // Dispatch BLOCK_THREADS template void Test(ScanOpT op, T identity, T initial_value) { Test<17>(op, identity, initial_value); Test<32>(op, identity, initial_value); Test<65>(op, identity, initial_value); Test<96>(op, identity, initial_value); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // %PARAM% TEST_VALUE_TYPES types 0:1:2:3:4:5:6:7:8:9 #if TEST_VALUE_TYPES == 0 // primitive Test(Sum(), static_cast(0), static_cast(99)); Test(Sum(), static_cast(0), static_cast(99)); #elif TEST_VALUE_TYPES == 1 Test(Sum(), static_cast(0), static_cast(99)); Test(Sum(), static_cast(0), static_cast(99)); #elif TEST_VALUE_TYPES == 2 // primitive (alternative scan op) Test(Max(), std::numeric_limits::lowest(), static_cast(99)); Test(Max(), std::numeric_limits::lowest(), static_cast(99)); #elif TEST_VALUE_TYPES == 3 Test(Max(), std::numeric_limits::lowest(), static_cast(99)); Test(Max(), std::numeric_limits::lowest(), static_cast(99)); #elif TEST_VALUE_TYPES == 4 // Floats Test(Sum(), static_cast(0), static_cast(99)); Test(Max(), std::numeric_limits::lowest(), static_cast(99)); #elif TEST_VALUE_TYPES == 5 // vec-1 Test(Sum(), make_uchar1(0), make_uchar1(17)); // vec-2 Test(Sum(), make_uchar2(0, 0), make_uchar2(17, 21)); Test(Sum(), make_ushort2(0, 0), make_ushort2(17, 21)); #elif TEST_VALUE_TYPES == 6 Test(Sum(), make_uint2(0, 0), make_uint2(17, 21)); Test(Sum(), make_ulonglong2(0, 0), make_ulonglong2(17, 21)); #elif TEST_VALUE_TYPES == 7 // vec-4 Test(Sum(), make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85)); Test(Sum(), make_short4(0, 0, 0, 0), make_short4(17, 21, 32, 85)); #elif TEST_VALUE_TYPES == 8 Test(Sum(), make_int4(0, 0, 0, 0), make_int4(17, 21, 32, 85)); Test(Sum(), make_longlong4(0, 0, 0, 0), make_longlong4(17, 21, 32, 85)); #elif TEST_VALUE_TYPES == 9 // complex Test(Sum(), TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(17, 21, 32, 85)); Test(Sum(), TestBar(0, 0), TestBar(17, 21)); #endif return 0; } cub-2.0.1/test/test_block_shuffle.cu000066400000000000000000000266101434614775400174640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of BlockShuffle utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include "test_util.h" using namespace cub; template __global__ void IotaKernel( const unsigned int num_items, DataType *data) { const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < num_items) { data[i] = i; } } template void Iota( const unsigned int num_items, DataType *data) { const unsigned int ThreadsPerBlock = 256; const unsigned int blocks_per_grid = (num_items + ThreadsPerBlock - 1) / ThreadsPerBlock; IotaKernel<<>>(num_items, data); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); } template < typename DataType, unsigned int ThreadsInBlock, unsigned int ItemsPerThread, typename ActionType> __global__ void BlockShuffleTestKernel( DataType *data, ActionType action) { typedef cub::BlockShuffle BlockShuffle; __shared__ typename BlockShuffle::TempStorage temp_storage_shuffle; DataType thread_data[ItemsPerThread]; data += threadIdx.x * ItemsPerThread; for (unsigned int item = 0; item < ItemsPerThread; item++) { thread_data[item] = data[item]; } __syncthreads(); BlockShuffle block_shuffle(temp_storage_shuffle); action(block_shuffle, thread_data); for (unsigned int item = 0; item < ItemsPerThread; item++) { data[item] = thread_data[item]; } } template< typename DataType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock, typename ActionType> void BlockShuffleTest(DataType *data, ActionType action) { BlockShuffleTestKernel<<<1, ThreadsInBlock>>> (data, action); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); } template < typename DataType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock> struct UpTest { __device__ void operator()( BlockShuffle &block_shuffle, DataType (&thread_data)[ItemsPerThread]) const { block_shuffle.Up(thread_data, thread_data); } static __host__ bool check(const DataType *data, int i) { if (i == 0) { return data[i] == 0; } return data[i] == i - 1; } }; template < typename DataType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock> struct DownTest { __device__ void operator()( BlockShuffle &block_shuffle, DataType (&thread_data)[ItemsPerThread]) const { block_shuffle.Down(thread_data, thread_data); } static __host__ bool check(const DataType *data, int i) { if (i == ItemsPerThread * ThreadsInBlock - 1) { return data[i] == i; } return data[i] == i + 1; } }; template struct OffsetTestBase { static constexpr unsigned int ItemsPerThread = 
1; __device__ void operator()( BlockShuffle &block_shuffle, DataType (&thread_data)[ItemsPerThread]) const { block_shuffle.Offset(thread_data[0], thread_data[0], offset); } }; template struct OffsetUpTest : public OffsetTestBase { static __host__ bool check(const DataType *data, int i) { return UpTest::check (data, i); } }; template struct OffsetDownTest : public OffsetTestBase { static __host__ bool check(const DataType *data, int i) { return DownTest::check (data, i); } }; template struct RotateTestBase { static constexpr unsigned int ItemsPerThread = 1; __device__ void operator()( BlockShuffle &block_shuffle, DataType (&thread_data)[ItemsPerThread]) const { block_shuffle.Rotate(thread_data[0], thread_data[0], offset); } static __host__ bool check(const DataType *data, int i) { return data[i] == static_cast((i + offset) % ThreadsInBlock); } }; template struct RotateUpTest : public RotateTestBase { }; template struct RotateTest : public RotateTestBase { }; template int CheckResult( int num_items, const DataType *d_output, DataType *h_output, const TestType &test) { CubDebugExit(cudaMemcpy(h_output, d_output, num_items * sizeof (DataType), cudaMemcpyDeviceToHost)); for (int i = 0; i < num_items; i++) { if (!test.check (h_output, i)) { return 1; } } return 0; } template < typename DataType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock, template class TestType> void Test(unsigned int num_items, DataType *d_data, DataType *h_data) { TestType test; Iota(num_items, d_data); BlockShuffleTest(d_data, test); AssertEquals(0, CheckResult(num_items, d_data, h_data, test)); } /** * Some methods of it only support a single element per thread. * This structure skips tests for unsupported cases. */ template < typename DataType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock, template class TestType> struct SingleItemTestHelper { static void run(unsigned int /* num_items */, DataType * /* d_data */, DataType * /* h_data */) { } }; template < typename DataType, unsigned int ThreadsInBlock, template class TestType> struct SingleItemTestHelper { static void run(unsigned int num_items, DataType *d_data, DataType *h_data) { TestType test; Iota(num_items, d_data); BlockShuffleTest(d_data, test); AssertEquals(0, CheckResult(num_items, d_data, h_data, test)); } }; template < typename DataType, unsigned int ItemsPerThread, unsigned int ThreadsInBlock> void Test( CachingDeviceAllocator &g_allocator ) { const unsigned int num_items = ItemsPerThread * ThreadsInBlock; DataType *d_data = nullptr; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(DataType) * num_items)); std::unique_ptr h_data(new DataType[num_items]); Test(num_items, d_data, h_data.get()); Test(num_items, d_data, h_data.get()); SingleItemTestHelper().run(num_items, d_data, h_data.get()); SingleItemTestHelper().run(num_items, d_data, h_data.get()); SingleItemTestHelper().run(num_items, d_data, h_data.get()); SingleItemTestHelper().run(num_items, d_data, h_data.get()); if (d_data) { CubDebugExit(g_allocator.DeviceFree(d_data)); } } template void Test(CachingDeviceAllocator &g_allocator) { Test(g_allocator); Test(g_allocator); Test(g_allocator); Test(g_allocator); Test(g_allocator); } int main(int argc, char** argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); CachingDeviceAllocator g_allocator(true); Test<1> (g_allocator); Test<2> (g_allocator); Test<15> (g_allocator); return 0; } 
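// ----------------------------------------------------------------------------
// Illustrative sketch (not invoked by the harness above): the basic
// cub::BlockShuffle usage pattern that the Up/Down tests exercise. Each thread
// owns ItemsPerThread consecutive items of a block-wide array; Up() shifts the
// block-wide sequence by one item toward higher thread ranks, leaving thread 0's
// first item untouched (the UpTest above relies on this by aliasing input and
// output). The kernel name below is illustrative only.
// ----------------------------------------------------------------------------
template <typename DataType, unsigned int ThreadsInBlock, unsigned int ItemsPerThread>
__global__ void ExampleBlockShuffleUpKernel(DataType *data)
{
  using BlockShuffleT = cub::BlockShuffle<DataType, ThreadsInBlock>;
  __shared__ typename BlockShuffleT::TempStorage temp_storage;

  // Load this thread's blocked segment of the input
  DataType thread_data[ItemsPerThread];
  for (unsigned int item = 0; item < ItemsPerThread; item++)
  {
    thread_data[item] = data[threadIdx.x * ItemsPerThread + item];
  }
  __syncthreads();

  // Block-wide shuffle: each item receives the value that precedes it in the
  // block-wide sequence (input and output arrays may be aliased)
  BlockShuffleT block_shuffle(temp_storage);
  block_shuffle.Up(thread_data, thread_data);

  // Write the shifted sequence back out
  for (unsigned int item = 0; item < ItemsPerThread; item++)
  {
    data[threadIdx.x * ItemsPerThread + item] = thread_data[item];
  }
}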
cub-2.0.1/test/test_cdp_variant_state.cu000066400000000000000000000020151434614775400203410ustar00rootroot00000000000000/* * Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include int main() { // This test just checks that RDC is enabled and detected properly when using // the %PARAM% system to request CDP support (see the README.md file in // this directory). // %PARAM% TEST_CDP cdp 0:1 #ifdef CUB_RDC_ENABLED return (TEST_CDP == 1) ? EXIT_SUCCESS : EXIT_FAILURE; #else return (TEST_CDP == 0) ? EXIT_SUCCESS : EXIT_FAILURE; #endif } cub-2.0.1/test/test_device_adjacent_difference.cu000066400000000000000000000533011434614775400221150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; constexpr bool READ_LEFT = true; constexpr bool READ_RIGHT = false; /** * \brief Generates integer sequence \f$S_n=i(i-1)/2\f$. 
* * The adjacent difference of this sequence produce consecutive numbers: * \f[ * p = \frac{i(i - 1)}{2} \\ * n = \frac{(i + 1) i}{2} \\ * n - p = i \\ * \frac{(i + 1) i}{2} - \frac{i (i - 1)}{2} = i \\ * (i + 1) i - i (i - 1) = 2 i \\ * (i + 1) - (i - 1) = 2 \\ * 2 = 2 * \f] */ template struct TestSequenceGenerator { template __device__ __host__ DestT operator()(SourceT index) const { return static_cast(index * (index - 1) / SourceT(2)); } }; template struct CustomDifference { template __device__ OutputT operator()(const InputT &lhs, const InputT &rhs) { return static_cast(lhs - rhs); } }; template void AdjacentDifference(void *temp_storage, std::size_t &temp_storage_bytes, IteratorT it, DifferenceOpT difference_op, NumItemsT num_items) { const bool is_default_op_in_use = std::is_same::value; if (ReadLeft) { if (is_default_op_in_use) { CubDebugExit( cub::DeviceAdjacentDifference::SubtractLeft(temp_storage, temp_storage_bytes, it, num_items)); } else { CubDebugExit( cub::DeviceAdjacentDifference::SubtractLeft(temp_storage, temp_storage_bytes, it, num_items, difference_op)); } } else { if (is_default_op_in_use) { CubDebugExit( cub::DeviceAdjacentDifference::SubtractRight(temp_storage, temp_storage_bytes, it, num_items)); } else { CubDebugExit( cub::DeviceAdjacentDifference::SubtractRight(temp_storage, temp_storage_bytes, it, num_items, difference_op)); } } } template void AdjacentDifferenceCopy(void *temp_storage, std::size_t &temp_storage_bytes, InputIteratorT input, OutputIteratorT output, DifferenceOpT difference_op, NumItemsT num_items) { const bool is_default_op_in_use = std::is_same::value; if (ReadLeft) { if (is_default_op_in_use) { CubDebugExit( cub::DeviceAdjacentDifference::SubtractLeftCopy(temp_storage, temp_storage_bytes, input, output, num_items)); } else { CubDebugExit( cub::DeviceAdjacentDifference::SubtractLeftCopy(temp_storage, temp_storage_bytes, input, output, num_items, difference_op)); } } else { if (is_default_op_in_use) { CubDebugExit( cub::DeviceAdjacentDifference::SubtractRightCopy(temp_storage, temp_storage_bytes, input, output, num_items)); } else { CubDebugExit( cub::DeviceAdjacentDifference::SubtractRightCopy(temp_storage, temp_storage_bytes, input, output, num_items, difference_op)); } } } template void AdjacentDifference(IteratorT it, DifferenceOpT difference_op, NumItemsT num_items) { std::size_t temp_storage_bytes {}; AdjacentDifference(nullptr, temp_storage_bytes, it, difference_op, num_items); thrust::device_vector temp_storage(temp_storage_bytes); AdjacentDifference(thrust::raw_pointer_cast(temp_storage.data()), temp_storage_bytes, it, difference_op, num_items); } template void AdjacentDifferenceCopy(InputIteratorT input, OutputIteratorT output, DifferenceOpT difference_op, NumItemsT num_items) { std::size_t temp_storage_bytes{}; AdjacentDifferenceCopy(nullptr, temp_storage_bytes, input, output, difference_op, num_items); thrust::device_vector temp_storage(temp_storage_bytes); AdjacentDifferenceCopy(thrust::raw_pointer_cast( temp_storage.data()), temp_storage_bytes, input, output, difference_op, num_items); } template bool CheckResult(FirstIteratorT first_begin, FirstIteratorT first_end, SecondOperatorT second_begin) { auto err = thrust::mismatch(first_begin, first_end, second_begin); if (err.first != first_end) { return false; } return true; } template void TestCopy(NumItemsT elements, DifferenceOpT difference_op) { thrust::device_vector input(elements); thrust::tabulate(input.begin(), input.end(), TestSequenceGenerator{}); thrust::device_vector 
output(elements, OutputT{42}); InputT *d_input = thrust::raw_pointer_cast(input.data()); OutputT *d_output = thrust::raw_pointer_cast(output.data()); using CountingIteratorT = typename thrust::counting_iterator; AdjacentDifferenceCopy(d_input, d_output, difference_op, elements); AssertTrue(CheckResult(output.begin() + 1, output.end(), CountingIteratorT(OutputT{0}))); thrust::fill(output.begin(), output.end(), OutputT{42}); AdjacentDifferenceCopy(d_input, d_output, difference_op, elements); thrust::device_vector reference(input.size()); thrust::sequence(reference.begin(), reference.end(), static_cast(0), static_cast(-1)); AssertTrue(CheckResult(output.begin(), output.end() - 1, reference.begin())); } template void TestIteratorCopy(NumItemsT elements, DifferenceOpT difference_op) { thrust::device_vector input(elements); thrust::tabulate(input.begin(), input.end(), TestSequenceGenerator{}); thrust::device_vector output(elements, OutputT{42}); using CountingIteratorT = typename thrust::counting_iterator; AdjacentDifferenceCopy(input.cbegin(), output.begin(), difference_op, elements); AssertTrue(CheckResult(output.begin() + 1, output.end(), CountingIteratorT(OutputT{0}))); thrust::fill(output.begin(), output.end(), OutputT{42}); AdjacentDifferenceCopy(input.cbegin(), output.begin(), difference_op, elements); thrust::device_vector reference(input.size()); thrust::sequence(reference.begin(), reference.end(), static_cast(0), static_cast(-1)); AssertTrue(CheckResult(output.begin(), output.end() - 1, reference.begin())); } template void TestCopy(NumItemsT elements) { TestCopy(elements, cub::Difference{}); TestCopy(elements, CustomDifference{}); TestIteratorCopy(elements, cub::Difference{}); TestIteratorCopy(elements, CustomDifference{}); } template void TestCopy(NumItemsT elements) { TestCopy(elements); TestCopy(elements); } template void Test(NumItemsT elements, DifferenceOpT difference_op) { thrust::device_vector data(elements); thrust::tabulate(data.begin(), data.end(), TestSequenceGenerator{}); T *d_data = thrust::raw_pointer_cast(data.data()); using CountingIteratorT = typename thrust::counting_iterator; AdjacentDifference(d_data, difference_op, elements); AssertTrue(CheckResult(data.begin() + 1, data.end(), CountingIteratorT(T{0}))); thrust::tabulate(data.begin(), data.end(), TestSequenceGenerator{}); AdjacentDifference(d_data, difference_op, elements); thrust::device_vector reference(data.size()); thrust::sequence(reference.begin(), reference.end(), static_cast(0), static_cast(-1)); AssertTrue(CheckResult(data.begin(), data.end() - 1, reference.begin())); } template void TestIterators(NumItemsT elements, DifferenceOpT difference_op) { thrust::device_vector data(elements); thrust::tabulate(data.begin(), data.end(), TestSequenceGenerator{}); using CountingIteratorT = typename thrust::counting_iterator; AdjacentDifference(data.begin(), difference_op, elements); AssertTrue(CheckResult(data.begin() + 1, data.end(), CountingIteratorT(T{0}))); thrust::tabulate(data.begin(), data.end(), TestSequenceGenerator{}); AdjacentDifference(data.begin(), difference_op, elements); thrust::device_vector reference(data.size()); thrust::sequence(reference.begin(), reference.end(), static_cast(0), static_cast(-1)); AssertTrue(CheckResult(data.begin(), data.end() - 1, reference.begin())); } template void Test(NumItemsT elements) { Test(elements, cub::Difference{}); Test(elements, CustomDifference{}); TestIterators(elements, cub::Difference{}); TestIterators(elements, CustomDifference{}); } template void 
Test(NumItemsT elements) { Test(elements); Test(elements); Test(elements); } template void TestFancyIterators(NumItemsT elements) { if (elements == 0) { return; } thrust::counting_iterator count_iter(ValueT{1}); thrust::device_vector output(elements, ValueT{42}); AdjacentDifferenceCopy(count_iter, output.begin(), cub::Difference{}, elements); AssertEquals(elements, static_cast( thrust::count(output.begin(), output.end(), ValueT(1)))); thrust::fill(output.begin(), output.end(), ValueT{}); AdjacentDifferenceCopy(count_iter, output.begin(), cub::Difference{}, elements); AssertEquals(elements - 1, static_cast( thrust::count(output.begin(), output.end() - 1, static_cast(-1)))); AssertEquals(output.back(), static_cast(elements)); thrust::constant_iterator const_iter(ValueT{}); AdjacentDifferenceCopy(const_iter, output.begin(), cub::Difference{}, elements); AssertEquals(elements, static_cast( thrust::count(output.begin(), output.end(), ValueT{}))); thrust::fill(output.begin(), output.end(), ValueT{}); AdjacentDifferenceCopy(const_iter, output.begin(), cub::Difference{}, elements); AssertEquals(elements, static_cast( thrust::count(output.begin(), output.end(), ValueT{}))); AdjacentDifferenceCopy(const_iter, thrust::make_discard_iterator(), cub::Difference{}, elements); AdjacentDifferenceCopy(const_iter, thrust::make_discard_iterator(), cub::Difference{}, elements); } template void TestFancyIterators(NumItemsT elements) { TestFancyIterators(elements); } template void TestSize(NumItemsT elements) { Test(elements); TestCopy(elements); TestFancyIterators(elements); } struct DetectWrongDifference { bool *flag; __host__ __device__ DetectWrongDifference operator++() const { return *this; } __host__ __device__ DetectWrongDifference operator*() const { return *this; } template __host__ __device__ DetectWrongDifference operator+(Difference) const { return *this; } template __host__ __device__ DetectWrongDifference operator[](Index) const { return *this; } __device__ void operator=(long long difference) const { if (difference != 1) { *flag = false; } } }; void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude) { const std::size_t elements = 1ll << magnitude; thrust::device_vector all_differences_correct(1, true); thrust::counting_iterator in(1); DetectWrongDifference out = { thrust::raw_pointer_cast(all_differences_correct.data()) }; AdjacentDifferenceCopy(in, out, cub::Difference{}, elements); AssertEquals(all_differences_correct.front(), true); } void TestAdjacentDifferenceWithBigIndexes() { TestAdjacentDifferenceWithBigIndexesHelper(30); TestAdjacentDifferenceWithBigIndexesHelper(31); TestAdjacentDifferenceWithBigIndexesHelper(32); TestAdjacentDifferenceWithBigIndexesHelper(33); } struct InvocationsCounter { int *m_d_counts{}; explicit InvocationsCounter(int *d_counts) : m_d_counts(d_counts) {} __device__ int operator()(int l, int /* r */) const { atomicAdd(m_d_counts + l, 1); return l; } }; void TestAdjacentDifferenceOpInvocationsNum(int num_items) { auto in = thrust::make_counting_iterator(0); auto out = thrust::make_discard_iterator(); thrust::device_vector num_of_invocations(num_items, 0); InvocationsCounter op{thrust::raw_pointer_cast(num_of_invocations.data())}; AdjacentDifferenceCopy(in, out, op, num_items); AssertEquals( num_items - 1, thrust::count(num_of_invocations.begin() + 1, num_of_invocations.end(), 1)); AssertEquals(0, num_of_invocations[0]); thrust::fill_n(num_of_invocations.begin(), num_items, 0); AdjacentDifferenceCopy(in, out, op, num_items); AssertEquals( num_items - 1, 
thrust::count(num_of_invocations.begin(), num_of_invocations.end() - 1, 1)); AssertEquals(0, num_of_invocations[num_items - 1]); } void TestAdjacentDifferenceOpInvocationsNum() { for (int num_items = 1; num_items < 4096; num_items *= 2) { TestAdjacentDifferenceOpInvocationsNum(num_items); } } int main(int argc, char** argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); TestSize(0); for (std::size_t power_of_two = 2; power_of_two < 20; power_of_two += 2) { TestSize(1ull << power_of_two); } TestAdjacentDifferenceWithBigIndexes(); TestAdjacentDifferenceOpInvocationsNum(); return 0; } cub-2.0.1/test/test_device_histogram.cu000066400000000000000000002120261434614775400201700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of DeviceHistogram utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include "test_util.h" #define TEST_HALF_T \ (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA #if TEST_HALF_T #include #endif using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; bool g_verbose_input = false; bool g_verbose = false; int g_timing_iterations = 0; CachingDeviceAllocator g_allocator(true); //--------------------------------------------------------------------- // Dispatch to different DeviceHistogram entrypoints //--------------------------------------------------------------------- template struct Dispatch; template struct Dispatch { /** * Dispatch to CUB multi histogram-range entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Range( int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT *(&d_levels)[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::MultiHistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); } return error; } #if TEST_HALF_T /** * Dispatch to CUB multi histogram-range entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Range( int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, half_t *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. 
The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. half_t *(&d_levels)[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::MultiHistogramRange( d_temp_storage, temp_storage_bytes, reinterpret_cast<__half*>(d_samples), d_histogram, num_levels, reinterpret_cast<__half *(&)[NUM_ACTIVE_CHANNELS]>(d_levels), num_row_pixels, num_rows, row_stride_bytes); } return error; } #endif /** * Dispatch to CUB multi histogram-even entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Even( int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::MultiHistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); } return error; } #if TEST_HALF_T /** * Dispatch to CUB multi histogram-even entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Even( int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, half_t *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. half_t *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. half_t *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::MultiHistogramEven( d_temp_storage, temp_storage_bytes, reinterpret_cast<__half*>(d_samples), d_histogram, num_levels, reinterpret_cast<__half*>(lower_level), reinterpret_cast<__half*>(upper_level), num_row_pixels, num_rows, row_stride_bytes); } return error; } #endif }; template <> struct Dispatch<1, 1, CUB> { /** * Dispatch to CUB single histogram-range entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Range( int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT (&d_levels)[1], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::HistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram[0], num_levels[0], d_levels[0], num_row_pixels, num_rows, row_stride_bytes); } return error; } #if TEST_HALF_T template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Range( int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, half_t *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. half_t (&d_levels)[1], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::HistogramRange( d_temp_storage, temp_storage_bytes, reinterpret_cast<__half*>(d_samples), d_histogram[0], num_levels[0], d_levels[0].operator __half(), num_row_pixels, num_rows, row_stride_bytes); } return error; } #endif /** * Dispatch to CUB single histogram-even entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Even( int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. 
LevelT *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::HistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram[0], num_levels[0], lower_level[0], upper_level[0], num_row_pixels, num_rows, row_stride_bytes); } return error; } #if TEST_HALF_T template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Even( int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, half_t *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. half_t *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. half_t *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::HistogramEven( d_temp_storage, temp_storage_bytes, reinterpret_cast<__half*>(d_samples), d_histogram[0], num_levels[0], lower_level[0].operator __half(), upper_level[0].operator __half(), num_row_pixels, num_rows, row_stride_bytes); } return error; } #endif }; //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- // Searches for bin given a list of bin-boundary levels template struct SearchTransform { LevelT *levels; // Pointer to levels array int num_levels; // Number of levels in array // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range) template int operator()(SampleT sample) { int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1); if (bin < 0) { // Sample out of range return num_levels; } return bin; } }; // Scales samples to evenly-spaced bins template struct ScaleTransform { int num_levels; // Number of levels in array LevelT max; // Max sample level (exclusive) LevelT min; // Min sample level (inclusive) LevelT scale; // Bin scaling factor void Init( int num_levels_, // Number of levels in array LevelT max_, // Max sample level (exclusive) LevelT min_, // Min sample level (inclusive) LevelT scale_) // Bin scaling factor { this->num_levels = num_levels_; this->max = max_; this->min = min_; this->scale = scale_; } // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range) template int operator()(SampleT sample) { if ((sample < min) || (sample >= max)) { // Sample out of range return num_levels; } return (int) ((((LevelT) sample) - min) / scale); } }; // Scales samples to evenly-spaced bins template <> struct ScaleTransform { int num_levels; // Number of levels in array float max; // Max sample level (exclusive) float min; // Min sample level (inclusive) float scale; // Bin scaling factor void Init( int _num_levels, // Number of levels in array float _max, // Max sample level (exclusive) float _min, // Min sample level (inclusive) float _scale) // Bin scaling factor { this->num_levels = _num_levels; this->max = _max; this->min = _min; this->scale = 1.0f / _scale; } // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range) template int operator()(SampleT sample) { if ((sample < min) || (sample >= max)) { // Sample out of range return num_levels; } return (int) ((((float) sample) - min) * scale); } }; /** * Generate sample */ template void Sample(T &datum, LevelT max_level, int entropy_reduction) { unsigned int max = (unsigned int) -1; unsigned int bits; RandomBits(bits, entropy_reduction); float fraction = (float(bits) / max); datum = (T) (fraction * max_level); } /** * Initialize histogram samples */ template < int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename LevelT, typename SampleT, typename OffsetT> void InitializeSamples( LevelT max_level, int entropy_reduction, SampleT *h_samples, OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number 
of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { // Initialize samples for (OffsetT row = 0; row < num_rows; ++row) { for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel) { for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { // Sample offset OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel; // Init sample value Sample(h_samples[offset], max_level, entropy_reduction); if (g_verbose_input) { if (channel > 0) printf(", "); std::cout << CoutCast(h_samples[offset]); } } } } } /** * Initialize histogram solutions */ template < int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename SampleIteratorT, typename TransformOp, typename OffsetT> void InitializeBins( SampleIteratorT h_samples, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. TransformOp transform_op[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. CounterT *h_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { using SampleT = cub::detail::value_t; // Init bins for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { for (int bin = 0; bin < num_levels[CHANNEL] - 1; ++bin) { h_histogram[CHANNEL][bin] = 0; } } // Initialize samples if (g_verbose_input) printf("Samples: \n"); for (OffsetT row = 0; row < num_rows; ++row) { for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel) { if (g_verbose_input) printf("["); for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { // Sample offset OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel; // Update sample bin int bin = transform_op[channel](h_samples[offset]); if (g_verbose_input) printf(" (%d)", bin); fflush(stdout); if ((bin >= 0) && (bin < num_levels[channel] - 1)) { // valid bin h_histogram[channel][bin]++; } } if (g_verbose_input) printf("]"); } if (g_verbose_input) printf("\n\n"); } } /** * Test histogram-even */ template < Backend BACKEND, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename CounterT, typename LevelT, typename OffsetT, typename SampleIteratorT> void TestEven( LevelT max_level, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest SampleIteratorT h_samples, SampleIteratorT d_samples) { OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT)); printf("\n----------------------------\n"); printf("%s cub::DeviceHistogram::Even (%s) " "%d pixels (%d height, %d width, %d-byte row stride), " "%d %d-byte %s samples (entropy reduction %d), " "%s levels, %s counters, %d/%d channels, max sample ", (BACKEND == CDP) ? "CDP CUB" : "CUB", (std::is_pointer::value) ? "pointer" : "iterator", (int) (num_row_pixels * num_rows), (int) num_rows, (int) num_row_pixels, (int) row_stride_bytes, (int) total_samples, (int) sizeof(SampleT), typeid(SampleT).name(), entropy_reduction, typeid(LevelT).name(), typeid(CounterT).name(), NUM_ACTIVE_CHANNELS, NUM_CHANNELS); std::cout << CoutCast(max_level) << "\n"; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { std::cout << "\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins " << "[" << lower_level[channel] << ", " << upper_level[channel] << ")\n"; } fflush(stdout); // Allocate and initialize host and device data typedef SampleT Foo; // rename type to quelch gcc warnings (bug?) CounterT* h_histogram[NUM_ACTIVE_CHANNELS]; ScaleTransform transform_op[NUM_ACTIVE_CHANNELS]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { int bins = num_levels[channel] - 1; h_histogram[channel] = new CounterT[bins]; transform_op[channel].Init( num_levels[channel], upper_level[channel], lower_level[channel], static_cast(((upper_level[channel] - lower_level[channel]) / static_cast(bins)))); } InitializeBins( h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes); // Allocate and initialize device data CounterT* d_histogram[NUM_ACTIVE_CHANNELS]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1))); CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1))); } // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; Dispatch::Even( 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); // Allocate temporary storage with "canary" zones int canary_bytes = 256; char canary_token = 8; char* canary_zone = new char[canary_bytes]; memset(canary_zone, canary_token, canary_bytes); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2))); CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2))); // Run warmup/correctness iteration Dispatch::Even( 1, d_temp_storage_bytes, d_cdp_error, ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); // Check canary 
zones if (g_verbose) { printf("Checking leading temp_storage canary zone (token = %d)\n" "------------------------------------------------------\n", static_cast(canary_token)); } int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose); AssertEquals(0, error); if (g_verbose) { printf("Checking trailing temp_storage canary zone (token = %d)\n" "-------------------------------------------------------\n", static_cast(canary_token)); } error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose); AssertEquals(0, error); // Flush any stdout/stderr CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); fflush(stdout); fflush(stderr); // Check for correctness (and display results, if specified) for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { if (g_verbose) { printf("Checking histogram result (channel = %d)\n" "----------------------------------------\n", channel); } int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose); printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n"); error |= channel_error; } // Performance GpuTimer gpu_timer; gpu_timer.Start(); Dispatch::Even( g_timing_iterations, d_temp_storage_bytes, d_cdp_error, ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * sizeof(SampleT); printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s", avg_millis, giga_rate, giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS, giga_rate / NUM_CHANNELS, giga_bandwidth); } printf("\n\n"); for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { if (h_histogram[channel]) delete[] h_histogram[channel]; if (d_histogram[channel]) CubDebugExit(g_allocator.DeviceFree(d_histogram[channel])); } if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, error); } /** * Test histogram-even (native pointer input) */ template < Backend BACKEND, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename CounterT, typename LevelT, typename OffsetT> void TestEvenNative( LevelT max_level, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT)); // Allocate and initialize host sample data typedef SampleT Foo; // rename type to quelch gcc warnings (bug?) SampleT* h_samples = new Foo[total_samples]; InitializeSamples( max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes); // Allocate and initialize device data SampleT* d_samples = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples)); CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice)); TestEven( max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes, h_samples, d_samples); // Cleanup if (h_samples) delete[] h_samples; if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples)); } /** * Test histogram-even (iterator input) */ template < Backend BACKEND, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename CounterT, typename LevelT, typename OffsetT> void TestEvenIterator( Int2Type /*is_half*/, LevelT max_level, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { SampleT sample = (SampleT) lower_level[0]; ConstantInputIterator sample_itr(sample); TestEven( max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes, sample_itr, sample_itr); } template void TestEvenIterator(Int2Type /*is_half*/, LevelT, int, int[NUM_ACTIVE_CHANNELS], LevelT[NUM_ACTIVE_CHANNELS], LevelT[NUM_ACTIVE_CHANNELS], OffsetT, OffsetT, OffsetT) { // We have to reinterpret cast `half_t *` pointer to `__half *` in this test. // Hence, iterators testing is not supported. } /** * Test histogram-range */ template < Backend BACKEND, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename CounterT, typename LevelT, typename OffsetT> void TestRange( LevelT max_level, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT* levels[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. 
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT)); printf("\n----------------------------\n"); printf("%s cub::DeviceHistogram::Range %d pixels " "(%d height, %d width, %d-byte row stride), " "%d %d-byte %s samples (entropy reduction %d), " "%s levels, %s counters, %d/%d channels, max sample ", (BACKEND == CDP) ? "CDP CUB" : "CUB", (int)(num_row_pixels * num_rows), (int)num_rows, (int)num_row_pixels, (int)row_stride_bytes, (int)total_samples, (int)sizeof(SampleT), typeid(SampleT).name(), entropy_reduction, typeid(LevelT).name(), typeid(CounterT).name(), NUM_ACTIVE_CHANNELS, NUM_CHANNELS); std::cout << CoutCast(max_level) << "\n"; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { printf("Channel %d: %d bins", channel, num_levels[channel] - 1); if (g_verbose) { std::cout << "[ " << levels[channel][0]; for (int level = 1; level < num_levels[channel]; ++level) { std::cout << ", " << levels[channel][level]; } printf("]"); } printf("\n"); } fflush(stdout); // Allocate and initialize host and device data typedef SampleT Foo; // rename type to quelch gcc warnings (bug?) SampleT* h_samples = new Foo[total_samples]; CounterT* h_histogram[NUM_ACTIVE_CHANNELS]; SearchTransform transform_op[NUM_ACTIVE_CHANNELS]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { transform_op[channel].levels = levels[channel]; transform_op[channel].num_levels = num_levels[channel]; int bins = num_levels[channel] - 1; h_histogram[channel] = new CounterT[bins]; } InitializeSamples( max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes); InitializeBins( h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes); // Allocate and initialize device data SampleT* d_samples = NULL; LevelT* d_levels[NUM_ACTIVE_CHANNELS]; CounterT* d_histogram[NUM_ACTIVE_CHANNELS]; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples)); CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice)); for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel])); CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel], sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice)); int bins = num_levels[channel] - 1; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * bins)); CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * bins)); } // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; Dispatch::Range( 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); // Allocate temporary storage with "canary" zones int canary_bytes = 256; char canary_token = 9; char* canary_zone = new 
char[canary_bytes]; memset(canary_zone, canary_token, canary_bytes); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2))); CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2))); // Run warmup/correctness iteration Dispatch::Range( 1, d_temp_storage_bytes, d_cdp_error, ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); // Check canary zones int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose); AssertEquals(0, error); error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose); AssertEquals(0, error); // Flush any stdout/stderr CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); fflush(stdout); fflush(stderr); // Check for correctness (and display results, if specified) for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose); printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n"); error |= channel_error; } // Performance GpuTimer gpu_timer; gpu_timer.Start(); Dispatch::Range( g_timing_iterations, d_temp_storage_bytes, d_cdp_error, ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * sizeof(SampleT); printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s", avg_millis, giga_rate, giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS, giga_rate / NUM_CHANNELS, giga_bandwidth); } printf("\n\n"); // Cleanup if (h_samples) delete[] h_samples; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { if (h_histogram[channel]) delete[] h_histogram[channel]; if (d_histogram[channel]) CubDebugExit(g_allocator.DeviceFree(d_histogram[channel])); if (d_levels[channel]) CubDebugExit(g_allocator.DeviceFree(d_levels[channel])); } if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples)); if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, error); } /** * Test histogram-even */ template < Backend BACKEND, typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void TestEven( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], LevelT max_level, int max_num_levels) { LevelT lower_level[NUM_ACTIVE_CHANNELS]; LevelT upper_level[NUM_ACTIVE_CHANNELS]; // Find smallest level increment int max_bins = max_num_levels - 1; LevelT min_level_increment = max_level / static_cast(max_bins); // Set upper and lower levels for each channel for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { int num_bins = num_levels[channel] - 1; lower_level[channel] 
= static_cast((max_level - (static_cast(num_bins) * min_level_increment)) / static_cast(2)); upper_level[channel] = static_cast((max_level + (static_cast(num_bins) * min_level_increment)) / static_cast(2)); } // Test pointer-based samples TestEvenNative( max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); // Test iterator-based samples (CUB-only) TestEvenIterator( Int2Type::value>{}, max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); } /** * Test histogram-range */ template < Backend BACKEND, typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void TestRange( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], LevelT max_level, int max_num_levels) { // Find smallest level increment int max_bins = max_num_levels - 1; LevelT min_level_increment = max_level / static_cast(max_bins); LevelT* levels[NUM_ACTIVE_CHANNELS]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { levels[channel] = new LevelT[num_levels[channel]]; int num_bins = num_levels[channel] - 1; LevelT lower_level = (max_level - static_cast(num_bins * min_level_increment)) / static_cast(2); for (int level = 0; level < num_levels[channel]; ++level) levels[channel][level] = lower_level + static_cast(level * min_level_increment); } TestRange( max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes); for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) delete[] levels[channel]; } /** * Test different entrypoints */ template < typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], LevelT max_level, int max_num_levels) { TestEven( num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels); TestRange( num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels); } /** * Test different number of levels */ template < typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, int entropy_reduction, LevelT max_level, int max_num_levels) { int num_levels[NUM_ACTIVE_CHANNELS]; // All different levels num_levels[0] = max_num_levels; for (int channel = 1; channel < NUM_ACTIVE_CHANNELS; ++channel) { num_levels[channel] = (num_levels[channel - 1] / 2) + 1; } Test( num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels); } /** * Test different entropy-levels */ template < typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, LevelT max_level, int max_num_levels) { // entropy_reduction = -1 -> all samples == 0 Test( num_row_pixels, num_rows, row_stride_bytes, -1, max_level, max_num_levels); Test( num_row_pixels, num_rows, row_stride_bytes, 0, max_level, max_num_levels); Test( num_row_pixels, num_rows, row_stride_bytes, 5, max_level, max_num_levels); } /** * Test different row strides */ template < typename SampleT, int NUM_CHANNELS, int 
NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( OffsetT num_row_pixels, OffsetT num_rows, LevelT max_level, int max_num_levels) { OffsetT row_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT); // No padding Test( num_row_pixels, num_rows, row_stride_bytes, max_level, max_num_levels); // 13 samples padding Test( num_row_pixels, num_rows, row_stride_bytes + (13 * sizeof(SampleT)), max_level, max_num_levels); } /** * Test different problem sizes */ template < typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( LevelT max_level, int max_num_levels) { // 0 row/col images Test( OffsetT(1920), OffsetT(0), max_level, max_num_levels); Test( OffsetT(0), OffsetT(0), max_level, max_num_levels); // Small inputs Test( OffsetT(15), OffsetT(1), max_level, max_num_levels); // 1080 image Test( OffsetT(1920), OffsetT(1080), max_level, max_num_levels); // Sample different aspect ratios sizes for (OffsetT rows = 1; rows < 1000000; rows *= 1000) { for (OffsetT cols = 1; cols < (1000000 / rows); cols *= 1000) { Test( cols, rows, max_level, max_num_levels); } } } /** * Test different channel interleavings (valid specialiation) */ template void TestChannels(LevelT max_level, int max_num_levels, Int2Type /*is_valid_tag*/, Int2Type /*test_extra_channels*/) { Test(max_level, max_num_levels); Test(max_level, max_num_levels); } template void TestChannels(LevelT max_level, int max_num_levels, Int2Type /*is_valid_tag*/, Int2Type /*test_extra_channels*/) { Test(max_level, max_num_levels); Test(max_level, max_num_levels); Test(max_level, max_num_levels); Test(max_level, max_num_levels); } template void TestChannels(LevelT /*max_level*/, int /*max_num_levels*/, Int2Type /*is_valid_tag*/, TestExtraChannels) {} void TestLevelsAliasing() { constexpr int num_levels = 7; int h_histogram[num_levels - 1]{}; int h_samples[]{ 0, 2, 4, 6, 8, 10, 12, // levels 1, // bin 0 3, 3, // bin 1 5, 5, 5, // bin 2 7, 7, 7, 7, // bin 3 9, 9, 9, 9, 9, // bin 4 11, 11, 11, 11, 11, 11 // bin 5 }; constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]); int *d_histogram{}; int *d_samples{}; CubDebugExit( g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram))); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples))); CubDebugExit( cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice)); // Alias levels with samples (fancy way to `d_histogram[bin]++`). int *d_levels = d_samples; std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit(cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples)); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); CubDebugExit(cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples)); CubDebugExit(cudaMemcpy(h_histogram, d_histogram, sizeof(h_histogram), cudaMemcpyDeviceToHost)); for (int bin = 0; bin < num_levels - 1; bin++) { // Each bin should contain `bin + 1` samples. Since samples also contain // levels, they contribute one extra item to each bin. 
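// For instance, bin 3 spans [6, 8): it receives the four input samples equal
// to 7 plus the aliased level value 6, i.e. 3 + 2 = 5 counts in total.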
AssertEquals(bin + 2, h_histogram[bin]); } CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); CubDebugExit(g_allocator.DeviceFree(d_histogram)); CubDebugExit(g_allocator.DeviceFree(d_levels)); } // Regression test for NVIDIA/cub#489: integer rounding errors lead to incorrect // bin detection: void TestIntegerBinCalcs() { constexpr int num_levels = 8; constexpr int num_bins = num_levels - 1; int h_histogram[num_bins]{}; const int h_histogram_ref[num_bins]{1, 5, 0, 2, 1, 0, 0}; const int h_samples[]{2, 6, 7, 2, 3, 0, 2, 2, 6, 999}; const int lower_level = 0; const int upper_level = 12; constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]); int *d_histogram{}; int *d_samples{}; CubDebugExit( g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram))); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples))); CubDebugExit( cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice)); std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples)); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples)); CubDebugExit(cudaMemcpy(h_histogram, d_histogram, sizeof(h_histogram), cudaMemcpyDeviceToHost)); for (int bin = 0; bin < num_bins; ++bin) { AssertEquals(h_histogram_ref[bin], h_histogram[bin]); } CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); CubDebugExit(g_allocator.DeviceFree(d_histogram)); CubDebugExit(g_allocator.DeviceFree(d_samples)); } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); g_verbose_input = args.CheckCmdLineFlag("v2"); args.GetCmdLineArgument("i", g_timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--i=] " "[--device=] " "[--v] " "[--v2] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); using true_t = Int2Type; using false_t = Int2Type; TestLevelsAliasing(); TestIntegerBinCalcs(); // regression test for NVIDIA/cub#489 #if TEST_HALF_T TestChannels(256, 256 + 1, true_t{}, true_t{}); #endif TestChannels (256, 256 + 1, true_t{}, true_t{}); TestChannels (8192, 8192 + 1, true_t{}, false_t{}); #if !defined(__ICC) // Fails with ICC for unknown reasons, see #332. TestChannels (1.0, 256 + 1, true_t{}, false_t{}); #endif // float samples, int levels, regression test for NVIDIA/cub#479. TestChannels (12, 7, true_t{}, true_t{}); // Test down-conversion of size_t offsets to int TestChannels (256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>{}, false_t{}); return 0; } cub-2.0.1/test/test_device_merge_sort.cu000066400000000000000000000255071434614775400203470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of DeviceMergeSort utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" #include #include #include // for std::bad_alloc #include #include using namespace cub; struct CustomLess { template __device__ bool operator()(DataType &lhs, DataType &rhs) { return lhs < rhs; } }; template bool CheckResult(thrust::device_vector &d_data) { const bool is_sorted = thrust::is_sorted(d_data.begin(), d_data.end(), CustomLess()); return is_sorted; } template struct ValueToKey { __device__ __host__ KeyType operator()(const ValueType &val) { return val; } }; template struct ValueToKey { __device__ __host__ HugeDataType operator()(const ValueType &val) { return HugeDataType(val); } }; template void Test(std::int64_t num_items, thrust::default_random_engine &rng, thrust::device_vector &d_keys, thrust::device_vector &d_values) { thrust::sequence(d_values.begin(), d_values.end()); thrust::shuffle(d_values.begin(), d_values.end(), rng); thrust::transform(d_values.begin(), d_values.end(), d_keys.begin(), ValueToKey()); thrust::device_vector d_keys_before_sort(d_keys); thrust::device_vector d_values_before_sort(d_values); thrust::device_vector d_keys_before_sort_copy(d_keys); thrust::device_vector d_values_before_sort_copy(d_values); size_t temp_size = 0; CubDebugExit(cub::DeviceMergeSort::SortPairs( nullptr, temp_size, thrust::raw_pointer_cast(d_keys.data()), thrust::raw_pointer_cast(d_values.data()), num_items, CustomLess())); thrust::device_vector tmp(temp_size); CubDebugExit(cub::DeviceMergeSort::SortPairs( thrust::raw_pointer_cast(tmp.data()), temp_size, thrust::raw_pointer_cast(d_keys.data()), thrust::raw_pointer_cast(d_values.data()), num_items, CustomLess())); thrust::device_vector 
d_keys_after_sort_copy(d_keys); thrust::device_vector d_values_after_sort_copy(d_values); AssertTrue(CheckResult(d_values)); CubDebugExit(cub::DeviceMergeSort::SortPairsCopy( thrust::raw_pointer_cast(tmp.data()), temp_size, thrust::raw_pointer_cast(d_keys_before_sort.data()), thrust::raw_pointer_cast(d_values_before_sort.data()), thrust::raw_pointer_cast(d_keys.data()), thrust::raw_pointer_cast(d_values.data()), num_items, CustomLess())); AssertEquals(d_keys, d_keys_after_sort_copy); AssertEquals(d_values, d_values_after_sort_copy); AssertEquals(d_keys_before_sort, d_keys_before_sort_copy); AssertEquals(d_values_before_sort, d_values_before_sort_copy); // At the moment stable sort is an alias to sort, so it's safe to use // temp_size storage allocated before CubDebugExit(cub::DeviceMergeSort::StableSortPairs( thrust::raw_pointer_cast(tmp.data()), temp_size, thrust::raw_pointer_cast(d_keys.data()), thrust::raw_pointer_cast(d_values.data()), num_items, CustomLess())); AssertTrue(CheckResult(d_values)); CubDebugExit(cub::DeviceMergeSort::SortPairsCopy( thrust::raw_pointer_cast(tmp.data()), temp_size, thrust::constant_iterator(KeyType(42)), thrust::counting_iterator(DataType(0)), thrust::raw_pointer_cast(d_keys.data()), thrust::raw_pointer_cast(d_values.data()), num_items, CustomLess())); thrust::sequence(d_values_before_sort.begin(), d_values_before_sort.end()); AssertEquals(d_values, d_values_before_sort); } template void TestKeys(std::int64_t num_items, thrust::default_random_engine &rng, thrust::device_vector &d_keys, thrust::device_vector &d_values) { thrust::sequence(d_values.begin(), d_values.end()); thrust::shuffle(d_values.begin(), d_values.end(), rng); thrust::transform(d_values.begin(), d_values.end(), d_keys.begin(), ValueToKey()); thrust::device_vector d_before_sort(d_keys); thrust::device_vector d_before_sort_copy(d_keys); size_t temp_size = 0; cub::DeviceMergeSort::SortKeys( nullptr, temp_size, thrust::raw_pointer_cast(d_keys.data()), num_items, CustomLess()); thrust::device_vector tmp(temp_size); CubDebugExit(cub::DeviceMergeSort::SortKeys( thrust::raw_pointer_cast(tmp.data()), temp_size, thrust::raw_pointer_cast(d_keys.data()), num_items, CustomLess())); thrust::device_vector d_after_sort(d_keys); AssertTrue(CheckResult(d_keys)); CubDebugExit(cub::DeviceMergeSort::SortKeysCopy( thrust::raw_pointer_cast(tmp.data()), temp_size, thrust::raw_pointer_cast(d_before_sort.data()), thrust::raw_pointer_cast(d_keys.data()), num_items, CustomLess())); AssertTrue(d_keys == d_after_sort); AssertTrue(d_before_sort == d_before_sort_copy); // At the moment stable sort is an alias to sort, so it's safe to use // temp_size storage allocated before CubDebugExit(cub::DeviceMergeSort::StableSortKeys( thrust::raw_pointer_cast(tmp.data()), temp_size, thrust::raw_pointer_cast(d_keys.data()), num_items, CustomLess())); AssertTrue(CheckResult(d_keys)); } template struct TestHelper { template static void AllocateAndTest(thrust::default_random_engine &rng, unsigned int num_items) { thrust::device_vector d_keys(num_items); thrust::device_vector d_values(num_items); Test(num_items, rng, d_keys, d_values); TestKeys(num_items, rng, d_keys, d_values); } }; template <> struct TestHelper { template static void AllocateAndTest(thrust::default_random_engine &, unsigned int) {} }; template void Test(thrust::default_random_engine &rng, unsigned int num_items) { TestHelper::template AllocateAndTest(rng, num_items); TestHelper::template AllocateAndTest(rng, num_items); TestHelper::template AllocateAndTest(rng, 
num_items); } template void AllocateAndTestIterators(unsigned int num_items) { thrust::device_vector d_keys(num_items); thrust::device_vector d_values(num_items); thrust::sequence(d_keys.begin(), d_keys.end()); thrust::sequence(d_values.begin(), d_values.end()); thrust::reverse(d_values.begin(), d_values.end()); using KeyIterator = typename thrust::device_vector::iterator; thrust::reverse_iterator reverse_iter(d_keys.end()); size_t temp_size = 0; cub::DeviceMergeSort::SortPairs( nullptr, temp_size, reverse_iter, thrust::raw_pointer_cast(d_values.data()), num_items, CustomLess()); thrust::device_vector tmp(temp_size); cub::DeviceMergeSort::SortPairs( thrust::raw_pointer_cast(tmp.data()), temp_size, reverse_iter, thrust::raw_pointer_cast(d_values.data()), num_items, CustomLess()); AssertTrue(CheckResult(d_values)); } template void Test(thrust::default_random_engine &rng) { for (unsigned int pow2 = 9; pow2 < 22; pow2 += 2) { try { const unsigned int num_items = 1 << pow2; AllocateAndTestIterators(num_items); TestHelper::AllocateAndTest(rng, num_items); Test(rng, num_items); } catch (std::bad_alloc &e) { if (pow2 > 20) { // Some cards don't have enough memory for large allocations, these // can be skipped. printf("Skipping large memory test. (num_items=2^%u): %s\n", pow2, e.what()); } else { // For smaller problem sizes, treat as an error: printf("Error (num_items=2^%u): %s", pow2, e.what()); throw; } } } } int main(int argc, char** argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); thrust::default_random_engine rng; Test(rng); Test(rng); return 0; } cub-2.0.1/test/test_device_radix_sort.cu000066400000000000000000002213371434614775400203560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of DeviceRadixSort utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA #include #endif #if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !_NVHPC_CUDA #include #endif #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_timing_iterations = 0; std::size_t g_smallest_pre_sorted_num_items = (std::size_t(1) << 32) - 42; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method (allows overwriting of input) CUB_NO_OVERWRITE, // CUB method (disallows overwriting of input) CUB_SEGMENTED, // CUB method (allows overwriting of input) CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input) // Same as above, but launches kernels from device using CDP. CDP, CDP_NO_OVERWRITE, CDP_SEGMENTED, CDP_SEGMENTED_NO_OVERWRITE, }; static const char* BackendToString(Backend b) { switch (b) { case CUB: return "CUB"; case CUB_NO_OVERWRITE: return "CUB_NO_OVERWRITE"; case CUB_SEGMENTED: return "CUB_SEGMENTED"; case CUB_SEGMENTED_NO_OVERWRITE: return "CUB_SEGMENTED_NO_OVERWRITE"; case CDP: return "CDP"; case CDP_NO_OVERWRITE: return "CDP_NO_OVERWRITE"; case CDP_SEGMENTED: return "CDP_SEGMENTED"; case CDP_SEGMENTED_NO_OVERWRITE: return "CDP_SEGMENTED_NO_OVERWRITE"; default: break; } return ""; } //--------------------------------------------------------------------- // Dispatch to different DeviceRadixSort entrypoints //--------------------------------------------------------------------- /** * Dispatch to CUB sorting entrypoint (specialized for ascending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int /*num_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, int begin_bit, int end_bit) { return DeviceRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit); } /** * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int /*num_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, int begin_bit, int end_bit) { KeyT const *const_keys_itr = d_keys.Current(); ValueT const *const_values_itr = d_values.Current(); cudaError_t retval = DeviceRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, 
const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(), num_items, begin_bit, end_bit); d_keys.selector ^= 1; d_values.selector ^= 1; return retval; } /** * Dispatch to CUB sorting entrypoint (specialized for descending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int /*num_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, int begin_bit, int end_bit) { return DeviceRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit); } /** * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for descending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int /*num_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, int begin_bit, int end_bit) { KeyT const *const_keys_itr = d_keys.Current(); ValueT const *const_values_itr = d_values.Current(); cudaError_t retval = DeviceRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(), num_items, begin_bit, end_bit); d_keys.selector ^= 1; d_values.selector ^= 1; return retval; } //--------------------------------------------------------------------- // Dispatch to different DeviceRadixSort entrypoints //--------------------------------------------------------------------- // Validates that `num_items` fits into `int` // TODO(canonizer): remove this check once num_items is templated for segmented sort. 
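// The helper below only has to confirm that the 64-bit item count can be
// narrowed to the plain `int` currently expected by DeviceSegmentedRadixSort.
// A minimal sketch of the idea (assuming NumItemsT is an unsigned 64-bit
// type such as std::size_t):
//
//   if (static_cast<long long>(num_items) < static_cast<long long>(INT_MAX))
//   { /* safe to narrow num_items to int */ }
//
// The real helper additionally prints a diagnostic before the caller falls
// back to returning cudaErrorInvalidValue.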
template __host__ __device__ bool ValidateNumItemsForSegmentedSort(NumItemsT num_items) { if (static_cast(num_items) < static_cast(INT_MAX)) { return true; } else { printf("cub::DeviceSegmentedRadixSort is currently limited by %d items but " "%lld were provided\n", INT_MAX, static_cast(num_items)); } return false; } /** * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for ascending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit) { if (ValidateNumItemsForSegmentedSort(num_items)) { return DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, static_cast(num_items), num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); } return cudaErrorInvalidValue; } /** * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for ascending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit) { if (ValidateNumItemsForSegmentedSort(num_items)) { KeyT const *const_keys_itr = d_keys.Current(); ValueT const *const_values_itr = d_values.Current(); cudaError_t retval = DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(), static_cast(num_items), num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); d_keys.selector ^= 1; d_values.selector ^= 1; return retval; } return cudaErrorInvalidValue; } /** * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for descending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit) { if (ValidateNumItemsForSegmentedSort(num_items)) { return DeviceSegmentedRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys, d_values, static_cast(num_items), num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); } return cudaErrorInvalidValue; } /** * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for descending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int 
end_bit) { if (ValidateNumItemsForSegmentedSort(num_items)) { KeyT const *const_keys_itr = d_keys.Current(); ValueT const *const_values_itr = d_values.Current(); cudaError_t retval = DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(), static_cast(num_items), num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); d_keys.selector ^= 1; d_values.selector ^= 1; return retval; } return cudaErrorInvalidValue; } //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceRadixSort */ template __global__ void CDPDispatchKernel(Int2Type is_descending, Int2Type cub_backend, int *d_selector, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, DoubleBuffer d_keys, DoubleBuffer d_values, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit) { *d_cdp_error = Dispatch(is_descending, cub_backend, d_selector, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); *d_temp_storage_bytes = temp_storage_bytes; *d_selector = d_keys.selector; } /** * Launch kernel and dispatch on device. Should only be called from host code. * The CubBackend should be one of the non-CDP CUB backends to invoke from the * device. */ template cudaError_t LaunchCDPKernel(Int2Type is_descending, Int2Type cub_backend, int *d_selector, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit) { // Invoke kernel to invoke device-side dispatch: cudaError_t retval = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, is_descending, cub_backend, d_selector, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); CubDebugExit(retval); CubDebugExit(cub::detail::device_synchronize()); // Copy out selector CubDebugExit(cudaMemcpy(&d_keys.selector, d_selector, sizeof(int) * 1, cudaMemcpyDeviceToHost)); d_values.selector = d_keys.selector; // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } // Specializations of Dispatch that translate the CDP backend to the appropriate // CUB backend, and uses the CUB backend to launch the CDP kernel. 
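// As a concrete illustration: DEFINE_CDP_DISPATCHER(CDP, CUB) below expands
// to a Dispatch overload selected by the CDP backend tag whose body does
// nothing but forward every argument to LaunchCDPKernel together with a CUB
// backend tag, so the device-side CDPDispatchKernel ends up calling the
// ordinary (non-CDP) CUB dispatcher. (Summary of the macro that follows;
// see its definition for the full parameter list.)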
#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend) \ template \ cudaError_t Dispatch(Int2Type is_descending, \ Int2Type /*dispatch_to*/, \ int *d_selector, \ size_t *d_temp_storage_bytes, \ cudaError_t *d_cdp_error, \ \ void *d_temp_storage, \ size_t &temp_storage_bytes, \ DoubleBuffer &d_keys, \ DoubleBuffer &d_values, \ NumItemsT num_items, \ int num_segments, \ BeginOffsetIteratorT d_segment_begin_offsets, \ EndOffsetIteratorT d_segment_end_offsets, \ int begin_bit, \ int end_bit) \ { \ Int2Type cub_backend{}; \ return LaunchCDPKernel(is_descending, \ cub_backend, \ d_selector, \ d_temp_storage_bytes, \ d_cdp_error, \ d_temp_storage, \ temp_storage_bytes, \ d_keys, \ d_values, \ num_items, \ num_segments, \ d_segment_begin_offsets, \ d_segment_end_offsets, \ begin_bit, \ end_bit); \ } DEFINE_CDP_DISPATCHER(CDP, CUB) DEFINE_CDP_DISPATCHER(CDP_NO_OVERWRITE, CUB_NO_OVERWRITE) DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED) DEFINE_CDP_DISPATCHER(CDP_SEGMENTED_NO_OVERWRITE, CUB_SEGMENTED_NO_OVERWRITE) #undef DEFINE_CDP_DISPATCHER #endif // TEST_CDP //--------------------------------------------------------------------- // Problem generation //--------------------------------------------------------------------- /** * Simple key-value pairing */ template < typename KeyT, typename ValueT> struct Pair { KeyT key; ValueT value; bool operator<(const Pair &b) const { return (key < b.key); } }; /** * Simple key-value pairing (specialized for bool types) */ template struct Pair { bool key; ValueT value; bool operator<(const Pair &b) const { return (!key && b.key); } }; /** * Initialize key data */ template void InitializeKeyBits( GenMode gen_mode, KeyT *h_keys, NumItemsT num_items, int /*entropy_reduction*/) { for (NumItemsT i = 0; i < num_items; ++i) InitValue(gen_mode, h_keys[i], i); } template ::UnsignedBits> UnsignedBits KeyBits(KeyT key) { UnsignedBits bits; memcpy(&bits, &key, sizeof(KeyT)); return bits; } /** Initialize the reference array monotonically. */ template void InitializeKeysSorted( KeyT *h_keys, NumItemsT num_items) { using TraitsT = cub::Traits; using UnsignedBits = typename TraitsT::UnsignedBits; // Numbers to generate random runs. UnsignedBits max_inc = 1 << (sizeof(UnsignedBits) < 4 ? 3 : (sizeof(UnsignedBits) < 8 ? 14 : 24)); UnsignedBits min_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Lowest())); UnsignedBits max_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Max())); NumItemsT max_run = std::max( NumItemsT(double(num_items) * (max_inc + 1) / max_bits), NumItemsT(1 << 14)); UnsignedBits *h_key_bits = reinterpret_cast(h_keys); NumItemsT i = 0; // Start with the minimum twiddled key. UnsignedBits twiddled_key = min_bits; while (i < num_items) { // Generate random increment (avoid overflow). UnsignedBits inc_bits = 0; RandomBits(inc_bits); // twiddled_key < max_bits at this point. UnsignedBits inc = static_cast(std::min(1 + inc_bits % max_inc, max_bits - twiddled_key)); twiddled_key += inc; // Generate random run length (ensure there are enough values to fill the rest). NumItemsT run_bits = 0; RandomBits(run_bits); NumItemsT run_length = std::min(1 + run_bits % max_run, num_items - i); if (twiddled_key == max_bits) run_length = num_items - i; NumItemsT run_end = i + run_length; // Fill the array. UnsignedBits key = TraitsT::TwiddleOut(twiddled_key); // Avoid -0.0 for floating-point keys. 
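// Why negative zero is avoided: -0.0 and +0.0 compare equal with operator==
// but are distinct bit patterns to a radix sort, so letting a -0.0 run into
// the generated keys could make the host-side bookkeeping (which groups
// equal keys) disagree with the bitwise order the device sort produces.
// The next statements therefore canonicalize the sign-bit-only pattern to
// +0.0 before filling the run.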
UnsignedBits negative_zero = UnsignedBits(1) << UnsignedBits(sizeof(UnsignedBits) * 8 - 1); if (TraitsT::CATEGORY == cub::FLOATING_POINT && key == negative_zero) { key = 0; } for (; i < run_end; ++i) { h_key_bits[i] = key; } } } /** * Initialize solution */ template void InitializeSolution( KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted, NumItemsT *h_segment_offsets, int begin_bit, int end_bit, NumItemsT *&h_reference_ranks, KeyT *&h_reference_keys) { if (num_items == 0) { h_reference_ranks = nullptr; h_reference_keys = nullptr; return; } if (pre_sorted) { printf("Shuffling reference solution on CPU\n"); // Note: begin_bit and end_bit are ignored here, and assumed to have the // default values (begin_bit == 0, end_bit == 8 * sizeof(KeyT)). // Otherwise, pre-sorting won't work, as it doesn't necessarily // correspond to the order of keys sorted by a subrange of bits. // num_segments is also ignored as assumed to be 1, as pre-sorted tests // are currently not supported for multiple segments. // // Pre-sorted tests with non-default begin_bit, end_bit or num_segments // != 1 are skipped in TestBits() and TestSegments(), respectively. AssertEquals(begin_bit, 0); AssertEquals(end_bit, static_cast(8 * sizeof(KeyT))); AssertEquals(num_segments, 1); // Copy to the reference solution. h_reference_keys = new KeyT[num_items]; if (IS_DESCENDING) { // Copy in reverse. for (NumItemsT i = 0; i < num_items; ++i) { h_reference_keys[i] = h_keys[num_items - 1 - i]; } // Copy back. memcpy(h_keys, h_reference_keys, num_items * sizeof(KeyT)); } else { memcpy(h_reference_keys, h_keys, num_items * sizeof(KeyT)); } // Summarize the pre-sorted array (element, 1st position, count). struct Element { KeyT key; NumItemsT num; NumItemsT index; }; std::vector summary; KeyT cur_key = h_reference_keys[0]; summary.push_back(Element{cur_key, 1, 0}); for (NumItemsT i = 1; i < num_items; ++i) { KeyT key = h_reference_keys[i]; if (key == cur_key) { // Same key. summary.back().num++; continue; } // Different key. cur_key = key; summary.push_back(Element{cur_key, 1, i}); } // Generate a random permutation from the summary. Such a complicated // approach is used to permute the array and compute ranks in a // cache-friendly way and in a short time. if (WANT_RANKS) { h_reference_ranks = new NumItemsT[num_items]; } NumItemsT max_run = 32; NumItemsT run = 0; NumItemsT i = 0; while (summary.size() > 0) { // Pick up a random element and a run. NumItemsT bits = 0; RandomBits(bits); NumItemsT summary_id = bits % summary.size(); Element& element = summary[summary_id]; run = std::min(1 + bits % (max_run - 1), element.num); for (NumItemsT j = 0; j < run; ++j) { h_keys[i + j] = element.key; if (WANT_RANKS) { h_reference_ranks[element.index + j] = i + j; } } i += run; element.index += run; element.num -= run; if (element.num == 0) { // Remove the empty entry. 
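// Removal uses the usual swap-with-last-and-pop idiom: the chosen summary
// entry is exchanged with the final element and the vector is shrunk by
// one, giving O(1) removal at the cost of reordering the summary (which is
// fine here, since entries are picked at random anyway).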
std::swap(summary[summary_id], summary.back()); summary.pop_back(); } } printf(" Done.\n"); } else { typedef Pair PairT; PairT *h_pairs = new PairT[num_items]; int num_bits = end_bit - begin_bit; for (NumItemsT i = 0; i < num_items; ++i) { // Mask off unwanted portions if (num_bits < static_cast(sizeof(KeyT) * 8)) { unsigned long long base = 0; memcpy(&base, &h_keys[i], sizeof(KeyT)); base &= ((1ull << num_bits) - 1) << begin_bit; memcpy(&h_pairs[i].key, &base, sizeof(KeyT)); } else { h_pairs[i].key = h_keys[i]; } h_pairs[i].value = i; } printf("\nSorting reference solution on CPU " "(%zd items, %d segments, %zd items/seg)...", static_cast(num_items), num_segments, static_cast(num_items / num_segments)); fflush(stdout); for (int i = 0; i < num_segments; ++i) { if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]); std::stable_sort( h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]); if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]); } printf(" Done.\n"); fflush(stdout); if (WANT_RANKS) { h_reference_ranks = new NumItemsT[num_items]; } h_reference_keys = new KeyT[num_items]; for (NumItemsT i = 0; i < num_items; ++i) { if (WANT_RANKS) { h_reference_ranks[i] = h_pairs[i].value; } h_reference_keys[i] = h_keys[h_pairs[i].value]; } if (h_pairs) delete[] h_pairs; } } template void ResetKeys(KeyT *h_keys, NumItemsT num_items, bool pre_sorted, KeyT *reference_keys) { if (!pre_sorted) return; // Copy the reference keys back. if (IS_DESCENDING) { // Keys need to be copied in reverse. for (NumItemsT i = 0; i < num_items; ++i) { h_keys[i] = reference_keys[num_items - 1 - i]; } } else { memcpy(h_keys, reference_keys, num_items * sizeof(KeyT)); } } //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- template struct UnwrapHalfAndBfloat16 { using Type = T; }; #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA template <> struct UnwrapHalfAndBfloat16 { using Type = __half; }; #endif #if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !_NVHPC_CUDA template <> struct UnwrapHalfAndBfloat16 { using Type = __nv_bfloat16; }; #endif /** * Test DeviceRadixSort */ template < Backend BACKEND, bool IS_DESCENDING, typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename NumItemsT> void Test( KeyT *h_keys, ValueT *h_values, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit, KeyT *h_reference_keys, ValueT *h_reference_values) { // Key alias type using KeyAliasT = typename UnwrapHalfAndBfloat16::Type; const bool KEYS_ONLY = std::is_same::value; printf("%s %s cub::DeviceRadixSort %zd items, %d segments, " "%d-byte keys (%s) %d-byte values (%s), %d-byte num_items (%s), " "descending %d, begin_bit %d, end_bit %d\n", BackendToString(BACKEND), (KEYS_ONLY) ? "keys-only" : "key-value", static_cast(num_items), num_segments, static_cast(sizeof(KeyT)), typeid(KeyT).name(), (KEYS_ONLY) ? 
0 : static_cast(sizeof(ValueT)), typeid(ValueT).name(), static_cast(sizeof(NumItemsT)), typeid(NumItemsT).name(), IS_DESCENDING, begin_bit, end_bit); if (g_verbose) { printf("Input keys:\n"); DisplayResults(h_keys, num_items); printf("\n\n"); } // Allocate device arrays DoubleBuffer d_keys; DoubleBuffer d_values; int *d_selector; size_t *d_temp_storage_bytes; cudaError_t *d_cdp_error; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(KeyT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(KeyT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_selector, sizeof(int) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); if (!KEYS_ONLY) { CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items)); } // Allocate temporary storage (and make it un-aligned) size_t temp_storage_bytes = 0; void *d_temp_storage = NULL; CubDebugExit(Dispatch( Int2Type(), Int2Type(), d_selector, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1)); void* mis_aligned_temp = static_cast(d_temp_storage) + 1; // Initialize/clear device arrays d_keys.selector = 0; CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items)); if (!KEYS_ONLY) { d_values.selector = 0; CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items)); } // Run warmup/correctness iteration CubDebugExit(Dispatch( Int2Type(), Int2Type(), d_selector, d_temp_storage_bytes, d_cdp_error, mis_aligned_temp, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit)); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Check for correctness (and display results, if specified) printf("Warmup done. Checking results:\n"); fflush(stdout); int compare = CompareDeviceResults(h_reference_keys, reinterpret_cast(d_keys.Current()), num_items, true, g_verbose); printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout); if (!KEYS_ONLY) { int values_compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose); compare |= values_compare; printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout); } if (BACKEND == CUB_NO_OVERWRITE) { // Check that input isn't overwritten int input_compare = CompareDeviceResults(h_keys, reinterpret_cast(d_keys.d_buffers[0]), num_items, true, g_verbose); compare |= input_compare; printf("\t Compare input keys: %s ", input_compare ? 
"FAIL" : "PASS"); fflush(stdout); } // Performance if (g_timing_iterations) printf("\nPerforming timing iterations:\n"); fflush(stdout); GpuTimer gpu_timer; float elapsed_millis = 0.0f; for (int i = 0; i < g_timing_iterations; ++i) { // Initialize/clear device arrays CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items)); if (!KEYS_ONLY) { CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items)); } gpu_timer.Start(); CubDebugExit(Dispatch( Int2Type(), Int2Type(), d_selector, d_temp_storage_bytes, d_cdp_error, mis_aligned_temp, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit)); gpu_timer.Stop(); elapsed_millis += gpu_timer.ElapsedMillis(); } // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = (KEYS_ONLY) ? giga_rate * sizeof(KeyT) * 2 : giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2; printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth); } printf("\n\n"); // Cleanup if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0])); if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1])); if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0])); if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1])); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector)); if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); // Correctness asserts AssertEquals(0, compare); } // Returns whether there is enough memory for the test. template bool HasEnoughMemory(std::size_t num_items, bool overwrite) { std::size_t total_mem = TotalGlobalMem(); std::size_t value_size = std::is_same::value ? 0 : sizeof(ValueT); // A conservative estimate of the amount of memory required. double factor = overwrite ? 2.25 : 3.25; std::size_t test_mem = static_cast (num_items * (sizeof(KeyT) + value_size) * factor); return test_mem < total_mem; } /** * Test backend */ template void TestBackend(KeyT *h_keys, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit, KeyT *h_reference_keys, NumItemsT *h_reference_ranks) { #if TEST_CDP == 0 constexpr auto NonSegmentedOverwrite = CUB; constexpr auto NonSegmentedNoOverwrite = CUB_NO_OVERWRITE; constexpr auto SegmentedOverwrite = CUB_SEGMENTED; constexpr auto SegmentedNoOverwrite = CUB_SEGMENTED_NO_OVERWRITE; #else // TEST_CDP constexpr auto NonSegmentedOverwrite = CDP; constexpr auto NonSegmentedNoOverwrite = CDP_NO_OVERWRITE; constexpr auto SegmentedOverwrite = CDP_SEGMENTED; constexpr auto SegmentedNoOverwrite = CDP_SEGMENTED_NO_OVERWRITE; #endif // TEST_CDP const bool KEYS_ONLY = std::is_same::value; // A conservative check assuming overwrite is allowed. 
if (!HasEnoughMemory(static_cast(num_items), true)) { printf("Skipping the test due to insufficient device memory\n"); return; } std::unique_ptr h_value_data{}; ValueT *h_values = nullptr; ValueT *h_reference_values = nullptr; if (!KEYS_ONLY) { h_value_data.reset(new ValueT[2 * static_cast(num_items)]); h_values = h_value_data.get(); h_reference_values = h_value_data.get() + num_items; for (NumItemsT i = 0; i < num_items; ++i) { InitValue(INTEGER_SEED, h_values[i], i); InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]); } } // Skip segmented sort if num_items isn't int. // TODO(64bit-seg-sort): re-enable these tests once num_items is templated for // segmented sort. if (std::is_same::value) { printf("Testing segmented sort with overwrite\n"); Test(h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); printf("Testing segmented sort with no overwrite\n"); Test(h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); } else { printf("Skipping segmented sort tests (NumItemsT != int)\n"); } if (num_segments == 1) { printf("Testing non-segmented sort with overwrite\n"); Test(h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); if (HasEnoughMemory(static_cast(num_items), false)) { printf("Testing non-segmented sort with no overwrite\n"); Test(h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); } else { printf("Skipping no-overwrite tests with %zd items due to " "insufficient memory\n", static_cast(num_items)); } } } // Smallest value type for TEST_VALUE_TYPE. // Unless TEST_VALUE_TYPE == 3, this is the only value type tested. #if TEST_VALUE_TYPE == 0 // Test keys-only using SmallestValueT = NullType; #elif TEST_VALUE_TYPE == 1 // Test with 8b value using SmallestValueT = unsigned char; #elif TEST_VALUE_TYPE == 2 // Test with 32b value using SmallestValueT = unsigned int; // Test with 64b value #elif TEST_VALUE_TYPE == 3 using SmallestValueT = unsigned long long; #endif /** * Test value type */ template void TestValueTypes( KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted, NumItemsT *h_segment_offsets, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit) { // Initialize the solution NumItemsT *h_reference_ranks = NULL; KeyT *h_reference_keys = NULL; // If TEST_VALUE_TYPE == 0, no values are sorted, only keys. // Since ranks are only necessary when checking for values, // they are not computed in this case. InitializeSolution(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys); TestBackend (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks); #if TEST_VALUE_TYPE == 3 // Test with non-trivially-constructable value // These are cheap to build, so lump them in with the 64b value tests. 
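// TestBar (from test_util.h) stands in for a value type with user-defined
// constructors and operators, so this extra pass checks that the pairs path
// handles non-primitive value payloads correctly; it shares the 64-bit
// value build simply to keep the number of test targets down.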
TestBackend (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks); #endif // Cleanup ResetKeys(h_keys, num_items, pre_sorted, h_reference_keys); if (h_reference_ranks) delete[] h_reference_ranks; if (h_reference_keys) delete[] h_reference_keys; } /** * Test ascending/descending */ template void TestDirection( KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted, NumItemsT *h_segment_offsets, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit) { TestValueTypes(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); TestValueTypes(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); } /** * Test different bit ranges */ template void TestBits( KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted, NumItemsT *h_segment_offsets, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets) { // Don't test partial-word sorting for boolean, fp, or signed types (the bit-flipping techniques get in the way) or pre-sorted keys if ((Traits::CATEGORY == UNSIGNED_INTEGER) && (!std::is_same::value) && !pre_sorted) { // Partial bits int begin_bit = 1; int end_bit = (sizeof(KeyT) * 8) - 1; printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout); TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); // Equal bits begin_bit = end_bit = 0; printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout); TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); // Across subword boundaries int mid_bit = sizeof(KeyT) * 4; printf("Testing key bits [%d,%d)\n", mid_bit - 1, mid_bit + 1); fflush(stdout); TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, mid_bit - 1, mid_bit + 1); } printf("Testing key bits [%d,%d)\n", 0, int(sizeof(KeyT)) * 8); fflush(stdout); TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, 0, sizeof(KeyT) * 8); } template struct TransformFunctor1 { __host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const { return offset; } }; template struct TransformFunctor2 { __host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const { return offset; } }; /** * Test different segment iterators */ template void TestSegmentIterators( KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted, NumItemsT *h_segment_offsets, NumItemsT *d_segment_offsets) { InitializeSegments(num_items, num_segments, h_segment_offsets); CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(NumItemsT) * (num_segments + 1), cudaMemcpyHostToDevice)); // Test with segment pointer. // This is also used to test non-segmented sort. 
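// The begin/end offsets passed below are the same array viewed at a
// one-element shift, i.e. the usual CSR-style description where segment i
// covers [offsets[i], offsets[i + 1]). For example, with offsets = {0, 5, 9},
// segment 0 is [0, 5) and segment 1 is [5, 9); the begin iterator walks
// offsets and the end iterator walks offsets + 1.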
TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_offsets, d_segment_offsets + 1); if (num_segments > 1) { // Test with transform iterators of different types typedef TransformFunctor1 TransformFunctor1T; typedef TransformFunctor2 TransformFunctor2T; TransformInputIterator d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T()); TransformInputIterator d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T()); TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets_itr, d_segment_end_offsets_itr); } } /** * Test different segment compositions */ template void TestSegments( KeyT *h_keys, NumItemsT num_items, int max_segments, bool pre_sorted) { max_segments = static_cast(CUB_MIN(num_items, static_cast(max_segments))); NumItemsT *h_segment_offsets = new NumItemsT[max_segments + 1]; NumItemsT *d_segment_offsets = nullptr; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(NumItemsT) * (max_segments + 1))); for (int num_segments = max_segments; num_segments > 1; num_segments = cub::DivideAndRoundUp(num_segments, 64)) { // Pre-sorted tests are not supported for segmented sort if (num_items / num_segments < 128 * 1000 && !pre_sorted) { // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment TestSegmentIterators(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_offsets); } } // Test single segment if (num_items > 0) { if (num_items < 128 * 1000 || pre_sorted) { // Right now we assign a single thread block to each segment, so lets // keep it to under 128K items per segment TestSegmentIterators(h_keys, num_items, 1, pre_sorted, h_segment_offsets, d_segment_offsets); } } if (h_segment_offsets) delete[] h_segment_offsets; if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets)); } /** * Test different NumItemsT, i.e. 
types of num_items */ template void TestNumItems(KeyT *h_keys, std::size_t num_items, int max_segments, bool pre_sorted) { if (!pre_sorted && num_items <= std::size_t(std::numeric_limits::max())) { TestSegments(h_keys, static_cast(num_items), max_segments, pre_sorted); } if (pre_sorted && num_items <= std::size_t(std::numeric_limits::max())) { TestSegments(h_keys, static_cast(num_items), max_segments, pre_sorted); } TestSegments(h_keys, num_items, max_segments, pre_sorted); } /** * Test different (sub)lengths and number of segments */ template void TestSizes(KeyT* h_keys, std::size_t max_items, int max_segments, bool pre_sorted) { if (pre_sorted) { // run a specific list of sizes, up to max_items std::size_t sizes[] = {g_smallest_pre_sorted_num_items, 4350000007ull}; for (std::size_t num_items : sizes) { if (num_items > max_items) break; TestNumItems(h_keys, num_items, max_segments, pre_sorted); } } else { for (std::size_t num_items = max_items; num_items > 1; num_items = cub::DivideAndRoundUp(num_items, 64)) { TestNumItems(h_keys, num_items, max_segments, pre_sorted); } } } /** * Test key sampling distributions */ template void TestGen( std::size_t max_items, int max_segments) { if (max_items == ~std::size_t(0)) { max_items = 8000003; } if (max_segments < 0) { max_segments = 5003; } std::unique_ptr h_keys(new KeyT[max_items]); // Test trivial problems sizes h_keys[0] = static_cast(42); TestNumItems(h_keys.get(), 0, 0, false); TestNumItems(h_keys.get(), 1, 1, false); for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 6) { printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction); fflush(stdout); InitializeKeyBits(RANDOM, h_keys.get(), max_items, entropy_reduction); TestSizes(h_keys.get(), max_items, max_segments, false); } if (cub::Traits::CATEGORY == cub::FLOATING_POINT) { printf("\nTesting random %s keys with some replaced with -0.0 or +0.0 \n", typeid(KeyT).name()); fflush(stdout); InitializeKeyBits(RANDOM_MINUS_PLUS_ZERO, h_keys.get(), max_items, 0); // This just tests +/- 0 handling -- don't need to test multiple sizes TestNumItems(h_keys.get(), max_items, max_segments, false); } printf("\nTesting uniform %s keys\n", typeid(KeyT).name()); fflush(stdout); InitializeKeyBits(UNIFORM, h_keys.get(), max_items, 0); TestSizes(h_keys.get(), max_items, max_segments, false); printf("\nTesting natural number %s keys\n", typeid(KeyT).name()); fflush(stdout); InitializeKeyBits(INTEGER_SEED, h_keys.get(), max_items, 0); TestSizes(h_keys.get(), max_items, max_segments, false); if (WITH_PRE_SORTED) { // Presorting is only used for testing large input arrays. const std::size_t large_num_items = std::size_t(4350000007ull); // A conservative check for memory, as we don't know ValueT or whether // the overwrite is allowed until later. // For ValueT, the check is actually exact unless TEST_VALUE_TYPE == 3. if (!HasEnoughMemory(large_num_items, true)) { printf("Skipping the permutation-based test due to insufficient device memory\n"); return; } h_keys.reset(nullptr); // Explicitly free old buffer before allocating. 
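// Releasing the previous allocation first keeps peak host memory down: the
// buffer allocated next holds roughly 4.35 billion keys, so holding it
// alongside the old max_items buffer could exhaust host RAM for the larger
// key types.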
h_keys.reset(new KeyT[large_num_items]); printf("\nTesting pre-sorted and randomly permuted %s keys\n", typeid(KeyT).name()); fflush(stdout); InitializeKeysSorted(h_keys.get(), large_num_items); fflush(stdout); TestSizes(h_keys.get(), large_num_items, max_segments, true); fflush(stdout); } } //--------------------------------------------------------------------- // Simple test //--------------------------------------------------------------------- template < Backend BACKEND, typename KeyT, typename ValueT, bool IS_DESCENDING> void Test( std::size_t num_items, int num_segments, GenMode gen_mode, int entropy_reduction, int begin_bit, int end_bit) { const bool KEYS_ONLY = std::is_same::value; KeyT *h_keys = new KeyT[num_items]; std::size_t *h_reference_ranks = NULL; KeyT *h_reference_keys = NULL; ValueT *h_values = NULL; ValueT *h_reference_values = NULL; size_t *h_segment_offsets = new std::size_t[num_segments + 1]; std::size_t* d_segment_offsets = nullptr; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(std::size_t) * (num_segments + 1))); if (end_bit < 0) end_bit = sizeof(KeyT) * 8; InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction); InitializeSegments(num_items, num_segments, h_segment_offsets); CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(std::size_t) * (num_segments + 1), cudaMemcpyHostToDevice)); InitializeSolution( h_keys, num_items, num_segments, false, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys); if (!KEYS_ONLY) { h_values = new ValueT[num_items]; h_reference_values = new ValueT[num_items]; for (std::size_t i = 0; i < num_items; ++i) { InitValue(INTEGER_SEED, h_values[i], i); InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]); } } if (h_reference_ranks) delete[] h_reference_ranks; printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout); Test( h_keys, h_values, num_items, num_segments, d_segment_offsets, d_segment_offsets + 1, begin_bit, end_bit, h_reference_keys, h_reference_values); if (h_keys) delete[] h_keys; if (h_reference_keys) delete[] h_reference_keys; if (h_values) delete[] h_values; if (h_reference_values) delete[] h_reference_values; if (h_segment_offsets) delete[] h_segment_offsets; if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets)); } #if TEST_VALUE_TYPE == 0 void TestUnspecifiedRanges() { const std::size_t num_items = 1024 * 1024; const std::size_t max_segments = 42; const std::size_t avg_segment_size = num_items / max_segments; for (int iteration = 0; iteration < 4; iteration++) { thrust::host_vector h_offsets_begin; thrust::host_vector h_offsets_end; h_offsets_begin.reserve(max_segments + 1); h_offsets_end.reserve(max_segments + 1); { int offset = 0; for (std::size_t sid = 0; sid < max_segments; sid++) { const int segment_size = static_cast(RandomValue(avg_segment_size)); const bool segment_is_utilized = segment_size > 0 && RandomValue(100) > 60; if (segment_is_utilized) { h_offsets_begin.push_back(offset); h_offsets_end.push_back(offset + segment_size); } offset += segment_size; } if (h_offsets_begin.empty()) { h_offsets_begin.push_back(avg_segment_size); h_offsets_end.push_back(num_items); } } thrust::device_vector keys(num_items); thrust::device_vector values(num_items); thrust::sequence(keys.rbegin(), keys.rend()); thrust::sequence(values.rbegin(), values.rend()); thrust::device_vector d_offsets_begin = h_offsets_begin; thrust::device_vector 
d_offsets_end = h_offsets_end; thrust::device_vector expected_keys = keys; thrust::device_vector expected_values = values; const int num_segments = static_cast(h_offsets_begin.size()); thrust::device_vector result_keys = keys; thrust::device_vector result_values = values; for (int sid = 0; sid < num_segments; sid++) { const int segment_begin = h_offsets_begin[sid]; const int segment_end = h_offsets_end[sid]; thrust::sort_by_key(expected_keys.begin() + segment_begin, expected_keys.begin() + segment_end, expected_values.begin() + segment_begin); } { cub::DoubleBuffer keys_buffer( thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(result_keys.data())); cub::DoubleBuffer values_buffer( thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(result_values.data())); std::size_t temp_storage_bytes{}; std::uint8_t *d_temp_storage{nullptr}; CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8)); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8)); for (int sid = 0; sid < num_segments; sid++) { const int segment_begin = h_offsets_begin[sid]; const int segment_end = h_offsets_end[sid]; if (keys_buffer.selector == 0) { thrust::copy( keys.begin() + segment_begin, keys.begin() + segment_end, result_keys.begin() + segment_begin); } if (values_buffer.selector == 0) { thrust::copy( values.begin() + segment_begin, values.begin() + segment_end, result_values.begin() + segment_begin); } } } AssertEquals(result_keys, expected_keys); AssertEquals(result_values, expected_values); thrust::sequence(keys.rbegin(), keys.rend()); thrust::sequence(values.rbegin(), values.rend()); result_keys = keys; result_values = values; { std::size_t temp_storage_bytes{}; std::uint8_t *d_temp_storage{}; CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(result_keys.data()), thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(result_values.data()), num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8)); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(result_keys.data()), thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(result_values.data()), num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8)); } AssertEquals(result_values, expected_values); AssertEquals(result_keys, expected_keys); } } #endif //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { std::size_t num_items = ~std::size_t(0); int num_segments = 
-1; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("s", num_segments); args.GetCmdLineArgument("i", g_timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--s= " "[--i= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // %PARAM% TEST_CDP cdp 0:1 // %PARAM% TEST_KEY_BYTES bytes 1:2:4:8 // %PARAM% TEST_VALUE_TYPE pairs 0:1:2:3 // 0->Keys only // 1->uchar // 2->uint // 3->[ull,TestBar] (TestBar is cheap to build, included here to // reduce total number of targets) // To reduce testing time, some key types are only tested when not // testing pairs: #if TEST_VALUE_TYPE == 0 #define TEST_EXTENDED_KEY_TYPES #endif // Compile/run thorough tests #if TEST_KEY_BYTES == 1 TestGen (num_items, num_segments); #ifdef TEST_EXTENDED_KEY_TYPES TestGen (num_items, num_segments); TestGen (num_items, num_segments); TestGen (num_items, num_segments); #endif // TEST_EXTENDED_KEY_TYPES #elif TEST_KEY_BYTES == 2 TestGen (num_items, num_segments); #ifdef TEST_EXTENDED_KEY_TYPES TestGen (num_items, num_segments); #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA TestGen (num_items, num_segments); #endif // CTK >= 9 #if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !_NVHPC_CUDA #if !defined(__ICC) // Fails with `-0 != 0` with ICC for unknown reasons. See #333. TestGen (num_items, num_segments); #endif // !ICC #endif // CTK >= 11 #endif // TEST_EXTENDED_KEY_TYPES #elif TEST_KEY_BYTES == 4 TestGen (num_items, num_segments); #if TEST_VALUE_TYPE == 0 TestUnspecifiedRanges(); #endif #ifdef TEST_EXTENDED_KEY_TYPES TestGen (num_items, num_segments); TestGen (num_items, num_segments); #endif // TEST_EXTENDED_KEY_TYPES #elif TEST_KEY_BYTES == 8 TestGen (num_items, num_segments); #ifdef TEST_EXTENDED_KEY_TYPES TestGen (num_items, num_segments); TestGen(num_items, num_segments); #endif // TEST_EXTENDED_KEY_TYPES #endif // TEST_KEY_BYTES switch return 0; } cub-2.0.1/test/test_device_reduce.cu000066400000000000000000001551001434614775400174410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of DeviceReduce utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include "test_util.h" #include #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- int g_ptx_version; int g_sm_count; double g_device_giga_bandwidth; bool g_verbose = false; bool g_verbose_input = false; int g_timing_iterations = 0; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CUB_SEGMENTED, // CUB segmented method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method CDP_SEGMENTED, // GPU-based segmented method }; inline const char* BackendToString(Backend b) { switch (b) { case CUB: return "CUB"; case CUB_SEGMENTED: return "CUB_SEGMENTED"; case CDP: return "CDP"; case CDP_SEGMENTED: return "CDP_SEGMENTED"; default: break; } return ""; } // Custom max functor struct CustomMax { /// Boolean max operator, returns (a > b) ? 
a : b template __host__ __device__ auto operator()(T&& a, C&& b) -> cub::detail::accumulator_t { return CUB_MAX(a, b); } }; //--------------------------------------------------------------------- // Dispatch to different CUB DeviceReduce entrypoints //--------------------------------------------------------------------- /** * Dispatch to reduce entrypoint (custom-max) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, ReductionOpT reduction_op) { using InputT = cub::detail::value_t; // The output value type using OutputT = cub::detail::non_void_value_t; // Max-identity OutputT identity = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, identity); } return error; } /** * Dispatch to sum entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::Sum /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } /** * Dispatch to min entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::Min /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } /** * Dispatch to max entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::Max /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } /** * Dispatch to argmin entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t 
*/*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::ArgMin /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } /** * Dispatch to argmax entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::ArgMax /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } //--------------------------------------------------------------------- // Dispatch to different CUB DeviceSegmentedReduce entrypoints //--------------------------------------------------------------------- /** * Dispatch to reduce entrypoint (custom-max) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT reduction_op) { // The input value type using InputT = cub::detail::value_t; // The output value type using OutputT = cub::detail::non_void_value_t; // Max-identity OutputT identity = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, identity); } return error; } /** * Dispatch to sum entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::Sum /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } /** * Dispatch to min entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT 
d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::Min /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } /** * Dispatch to max entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::Max /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } /** * Dispatch to argmin entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::ArgMin /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } /** * Dispatch to argmax entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::ArgMax /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } //--------------------------------------------------------------------- // CUDA nested-parallelism test kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceReduce */ template __global__ void CDPDispatchKernel(Int2Type cub_backend, int timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT reduction_op) { *d_cdp_error = Dispatch(cub_backend, timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op); *d_temp_storage_bytes = temp_storage_bytes; } /** * Launch 
kernel and dispatch on device. Should only be called from host code. * The CubBackend should be one of the non-CDP CUB backends to invoke from the * device. */ template cudaError_t LaunchCDPKernel(Int2Type cub_backend, int timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT reduction_op) { cudaError_t retval = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, cub_backend, timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op); CubDebugExit(retval); CubDebugExit(cub::detail::device_synchronize()); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } // Specializations of Dispatch that translate the CDP backend to the appropriate // CUB backend, and uses the CUB backend to launch the CDP kernel. #define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend) \ template \ cudaError_t Dispatch(Int2Type, \ int timing_iterations, \ size_t *d_temp_storage_bytes, \ cudaError_t *d_cdp_error, \ \ void *d_temp_storage, \ size_t &temp_storage_bytes, \ InputIteratorT d_in, \ OutputIteratorT d_out, \ int num_items, \ int max_segments, \ BeginOffsetIteratorT d_segment_begin_offsets, \ EndOffsetIteratorT d_segment_end_offsets, \ ReductionOpT reduction_op) \ { \ Int2Type cub_backend{}; \ return LaunchCDPKernel(cub_backend, \ timing_iterations, \ d_temp_storage_bytes, \ d_cdp_error, \ d_temp_storage, \ temp_storage_bytes, \ d_in, \ d_out, \ num_items, \ max_segments, \ d_segment_begin_offsets, \ d_segment_end_offsets, \ reduction_op); \ } DEFINE_CDP_DISPATCHER(CDP, CUB) DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED) #undef DEFINE_CDP_DISPATCHER #endif // TEST_CDP //--------------------------------------------------------------------- // Problem generation //--------------------------------------------------------------------- /// Initialize problem template void Initialize( GenMode gen_mode, InputT *h_in, int num_items) { for (int i = 0; i < num_items; ++i) { InitValue(gen_mode, h_in[i], i); } if (g_verbose_input) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /// Solve problem (max/custom-max functor) template struct Solution { using OutputT = _OutputT; using InitT = OutputT; using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op) { for (int i = 0; i < num_segments; ++i) { AccumT aggregate = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, OutputT(h_in[j])); h_reference[i] = aggregate; } } }; /// Solve problem (min functor) template struct Solution { using OutputT = _OutputT; using InitT = OutputT; using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT 
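// (Illustrative sketch, not part of this test.)  The host-side Solution
// reference structs here mirror the device accumulator by naming
// cub::detail::accumulator_t, which -- per the CUB 2.0 accumulator-type
// change -- is assumed here to be the decayed result of invoking the
// reduction operator on (InitT, InputT).  A compile-time spot check under
// that assumption; ExampleAccumT is a hypothetical alias:
#include <type_traits>
using ExampleAccumT = cub::detail::accumulator_t<cub::Sum, int, char>;
static_assert(std::is_same<ExampleAccumT, int>::value,
              "summing char input with an int initial value accumulates in int");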
*h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::Min reduction_op) { for (int i = 0; i < num_segments; ++i) { AccumT aggregate = Traits::Max(); // replace with std::numeric_limits::max() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, OutputT(h_in[j])); h_reference[i] = aggregate; } } }; /// Solve problem (sum functor) template struct Solution { using OutputT = _OutputT; using InitT = OutputT; using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::Sum reduction_op) { for (int i = 0; i < num_segments; ++i) { AccumT aggregate; InitValue(INTEGER_SEED, aggregate, 0); for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, h_in[j]); h_reference[i] = static_cast(aggregate); } } }; /// Solve problem (argmin functor) template struct Solution { typedef KeyValuePair OutputT; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMin reduction_op) { for (int i = 0; i < num_segments; ++i) { OutputT aggregate(1, Traits::Max()); // replace with std::numeric_limits::max() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) { OutputT item(j - h_segment_begin_offsets[i], OutputValueT(h_in[j])); aggregate = reduction_op(aggregate, item); } h_reference[i] = aggregate; } } }; /// Solve problem (argmax functor) template struct Solution { typedef KeyValuePair OutputT; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMax reduction_op) { for (int i = 0; i < num_segments; ++i) { OutputT aggregate(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) { OutputT item(j - h_segment_begin_offsets[i], OutputValueT(h_in[j])); aggregate = reduction_op(aggregate, item); } h_reference[i] = aggregate; } } }; //--------------------------------------------------------------------- // Problem generation //--------------------------------------------------------------------- /// Test DeviceReduce for a given problem input template < typename BackendT, typename DeviceInputIteratorT, typename DeviceOutputIteratorT, typename HostReferenceIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT> void Test( BackendT backend, DeviceInputIteratorT d_in, DeviceOutputIteratorT d_out, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT reduction_op, HostReferenceIteratorT h_reference) { // Input data types using InputT = cub::detail::value_t; // Allocate CDP device arrays for temp storage size and error size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, 
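// (Illustrative sketch, not part of this test.)  The segmented dispatchers
// above wrap cub::DeviceSegmentedReduce, where segment i spans
// [d_offsets[i], d_offsets[i + 1]).  A minimal standalone call to the Sum
// entry point with a plain offsets array; ExampleSegmentedSum is a
// hypothetical name:
void ExampleSegmentedSum(const int *d_in,      // flat input values
                         const int *d_offsets, // num_segments + 1 offsets
                         int        num_segments,
                         int       *d_out)     // one sum per segment
{
    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
                                    d_in, d_out, num_segments,
                                    d_offsets, d_offsets + 1);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
                                    d_in, d_out, num_segments,
                                    d_offsets, d_offsets + 1);
    cudaFree(d_temp_storage);
}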
sizeof(cudaError_t) * 1)); // Inquire temp device storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch(backend, 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op)); // Allocate temp device storage CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run warmup/correctness iteration CubDebugExit(Dispatch(backend, 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose); printf("\t%s", compare ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Performance if (g_timing_iterations > 0) { GpuTimer gpu_timer; gpu_timer.Start(); CubDebugExit(Dispatch(backend, g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op)); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * sizeof(InputT); printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); } if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, compare); } /// Test DeviceReduce template < Backend BACKEND, typename OutputValueT, typename HostInputIteratorT, typename DeviceInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT> void SolveAndTest( HostInputIteratorT h_in, DeviceInputIteratorT d_in, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT reduction_op) { using InputValueT = cub::detail::value_t; using SolutionT = Solution; using OutputT = typename SolutionT::OutputT; printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n", BackendToString(BACKEND), typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments); fflush(stdout); // Allocate and solve solution OutputT *h_reference = new OutputT[num_segments]; SolutionT::Solve(h_in, h_reference, num_segments, h_segment_begin_offsets, h_segment_end_offsets, reduction_op); // Run with discard iterator DiscardOutputIterator discard_itr; Test(Int2Type(), d_in, discard_itr, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, h_reference); // Run with output data OutputT *d_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_segments)); CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments)); Test(Int2Type(), d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, 
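// (Illustrative sketch, not part of this test.)  The SolveAndTest harness here
// first runs each reduction into a cub::DiscardOutputIterator, which accepts
// and ignores writes -- handy for exercising a kernel without allocating any
// output storage.  A minimal example of that pattern; ExampleSumIntoDiscard is
// a hypothetical name:
void ExampleSumIntoDiscard(const int *d_in, int num_items)
{
    cub::DiscardOutputIterator<> discard_out;

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                           d_in, discard_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                           d_in, discard_out, num_items);
    cudaFree(d_temp_storage);
}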
h_reference); // Cleanup if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (h_reference) delete[] h_reference; } /// Test specific problem type template < Backend BACKEND, typename InputT, typename OutputT, typename OffsetT, typename ReductionOpT> void TestProblem( OffsetT num_items, OffsetT num_segments, GenMode gen_mode, ReductionOpT reduction_op) { printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); fflush(stdout); // Initialize value data InputT* h_in = new InputT[num_items]; Initialize(gen_mode, h_in, num_items); // Initialize segment data OffsetT *h_segment_offsets = new OffsetT[num_segments + 1]; InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input); // Initialize device data OffsetT *d_segment_offsets = NULL; InputT *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1))); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice)); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1, reduction_op); if (h_segment_offsets) delete[] h_segment_offsets; if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets)); if (h_in) delete[] h_in; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); } /// Test different operators template < Backend BACKEND, typename OutputT, typename HostInputIteratorT, typename DeviceInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> void TestByOp( HostInputIteratorT h_in, DeviceInputIteratorT d_in, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets) { SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, CustomMax()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, Sum()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, Min()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, ArgMin()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, Max()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, ArgMax()); } template struct TransformFunctor1 { __host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const { return offset; } }; template struct TransformFunctor2 { __host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const { return offset; } }; /// Test different backends template < typename InputT, typename OutputT, typename OffsetT> void TestByBackend( OffsetT num_items, OffsetT max_segments, GenMode gen_mode) { #if TEST_CDP == 0 constexpr auto 
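// (Illustrative sketch, not part of this test.)  TestByOp above drives the
// harness with a user-defined functor (CustomMax) through
// cub::DeviceReduce::Reduce, which takes the binary operator plus an explicit
// initial value.  A stripped-down version of that call shape; ExampleMaxOp and
// ExampleCustomReduce are hypothetical names:
#include <limits>
struct ExampleMaxOp
{
    template <typename T>
    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
    {
        return (b > a) ? b : a;
    }
};

void ExampleCustomReduce(const float *d_in, int num_items, float *d_out)
{
    const float init = std::numeric_limits<float>::lowest(); // identity for max

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                              d_in, d_out, num_items, ExampleMaxOp(), init);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                              d_in, d_out, num_items, ExampleMaxOp(), init);
    cudaFree(d_temp_storage);
}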
NonSegmentedBackend = CUB; constexpr auto SegmentedBackend = CUB_SEGMENTED; #else // TEST_CDP constexpr auto NonSegmentedBackend = CDP; constexpr auto SegmentedBackend = CDP_SEGMENTED; #endif // TEST_CDP // Initialize host data printf("\n\nInitializing %d %s -> %s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); InputT *h_in = new InputT[num_items]; OffsetT *h_segment_offsets = new OffsetT[max_segments + 1]; Initialize(gen_mode, h_in, num_items); // Initialize device data InputT *d_in = NULL; OffsetT *d_segment_offsets = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (max_segments + 1))); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); // // Test single-segment implementations // InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input); // Page-aligned-input tests TestByOp(h_in, d_in, num_items, 1, h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL); // Non-page-aligned-input tests if (num_items > 1) { InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input); TestByOp(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL); } // // Test segmented implementation // // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment int max_items_per_segment = 128000; for (int num_segments = cub::DivideAndRoundUp(num_items, max_items_per_segment); num_segments < max_segments; num_segments = (num_segments * 32) + 1) { // Test with segment pointer InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input); CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice)); TestByOp(h_in, d_in, num_items, num_segments, h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1); // Test with segment iterator typedef CastOp IdentityOpT; IdentityOpT identity_op; TransformInputIterator h_segment_offsets_itr( h_segment_offsets, identity_op); TransformInputIterator d_segment_offsets_itr( d_segment_offsets, identity_op); TestByOp(h_in, d_in, num_items, num_segments, h_segment_offsets_itr, h_segment_offsets_itr + 1, d_segment_offsets_itr, d_segment_offsets_itr + 1); // Test with transform iterators of different types typedef TransformFunctor1 TransformFunctor1T; typedef TransformFunctor2 TransformFunctor2T; TransformInputIterator h_segment_begin_offsets_itr(h_segment_offsets, TransformFunctor1T()); TransformInputIterator h_segment_end_offsets_itr(h_segment_offsets + 1, TransformFunctor2T()); TransformInputIterator d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T()); TransformInputIterator d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T()); TestByOp(h_in, d_in, num_items, num_segments, h_segment_begin_offsets_itr, h_segment_end_offsets_itr, d_segment_begin_offsets_itr, d_segment_end_offsets_itr); } if (h_in) delete[] h_in; if (h_segment_offsets) delete[] h_segment_offsets; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets)); } /// Test different input-generation modes template < typename InputT, typename OutputT, typename OffsetT> void TestByGenMode( OffsetT num_items, OffsetT max_segments) { // // Test pointer 
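// (Illustrative sketch, not part of this test.)  The segmented tests above
// also supply the begin/end offsets through cub::TransformInputIterator, so
// that the offsets can come from arbitrary iterator types (the test even uses
// two different functor types for begin and end).  A minimal example of
// adapting a raw offset pointer that way; ExampleOffsetOp and
// ExampleWrappedOffsets are hypothetical names:
struct ExampleOffsetOp
{
    __host__ __device__ __forceinline__ int operator()(int offset) const
    {
        return offset; // identity transform, as in the test above
    }
};

void ExampleWrappedOffsets(const int *d_in,
                           const int *d_offsets, // num_segments + 1 offsets
                           int        num_segments,
                           int       *d_out)
{
    cub::TransformInputIterator<int, ExampleOffsetOp, const int *>
        d_begin(d_offsets, ExampleOffsetOp()),
        d_end(d_offsets + 1, ExampleOffsetOp());

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
                                    d_in, d_out, num_segments, d_begin, d_end);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
                                    d_in, d_out, num_segments, d_begin, d_end);
    cudaFree(d_temp_storage);
}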
support using different input-generation modes // TestByBackend(num_items, max_segments, UNIFORM); TestByBackend(num_items, max_segments, INTEGER_SEED); TestByBackend(num_items, max_segments, RANDOM); // // Test iterator support using a constant-iterator and SUM // InputT val; InitValue(UNIFORM, val, 0); ConstantInputIterator h_in(val); OffsetT *h_segment_offsets = new OffsetT[1 + 1]; InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input); #if TEST_CDP == 0 constexpr auto Backend = CUB; #else // TEST_CDP constexpr auto Backend = CDP; #endif // TEST_CDP SolveAndTest(h_in, h_in, num_items, 1, h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL, Sum()); if (h_segment_offsets) delete[] h_segment_offsets; } /// Test different problem sizes template void TestBySize(OffsetT max_items, OffsetT max_segments, OffsetT tile_size) { // Test 0, 1, many TestByGenMode(0, max_segments); TestByGenMode(1, max_segments); TestByGenMode(max_items, max_segments); // Test random problem sizes from a log-distribution [8, max_items-ish) int num_iterations = 8; double max_exp = log(double(max_items)) / log(double(2.0)); for (int i = 0; i < num_iterations; ++i) { OffsetT num_items = (OffsetT)pow(2.0, RandomValue(max_exp - 3.0) + 3.0); TestByGenMode(num_items, max_segments); } // // White-box testing of single-segment problems around specific sizes // #if TEST_CDP == 0 constexpr auto Backend = CUB; #else // TEST_CDP constexpr auto Backend = CDP; #endif // TEST_CDP // Tile-boundaries: multiple blocks, one tile per block TestProblem(tile_size * 4, 1, RANDOM, Sum()); TestProblem(tile_size * 4 + 1, 1, RANDOM, Sum()); TestProblem(tile_size * 4 - 1, 1, RANDOM, Sum()); // Tile-boundaries: multiple blocks, multiple tiles per block OffsetT sm_occupancy = 32; OffsetT occupancy = tile_size * sm_occupancy * g_sm_count; TestProblem(occupancy, 1, RANDOM, Sum()); TestProblem(occupancy + 1, 1, RANDOM, Sum()); TestProblem(occupancy - 1, 1, RANDOM, Sum()); }; class CustomInputT { char m_val{}; public: __host__ __device__ explicit CustomInputT(char val) : m_val(val) {} __host__ __device__ int get() const { return static_cast(m_val); } }; class CustomAccumulatorT { int m_val{0}; int m_magic_value{42}; __host__ __device__ CustomAccumulatorT(int val) : m_val(val) {} public: __host__ __device__ CustomAccumulatorT() {} __host__ __device__ CustomAccumulatorT(const CustomAccumulatorT &in) : m_val(in.is_valid() * in.get()) , m_magic_value(in.is_valid() * 42) {} __host__ __device__ void operator=(const CustomInputT &in) { if (this->is_valid()) { m_val = in.get(); } } __host__ __device__ void operator=(const CustomAccumulatorT &in) { if (this->is_valid() && in.is_valid()) { m_val = in.get(); } } __host__ __device__ CustomAccumulatorT operator+(const CustomInputT &in) const { const int multiplier = this->is_valid(); return {(m_val + in.get()) * multiplier}; } __host__ __device__ CustomAccumulatorT operator+(const CustomAccumulatorT &in) const { const int multiplier = this->is_valid() && in.is_valid(); return {(m_val + in.get()) * multiplier}; } __host__ __device__ int get() const { return m_val; } __host__ __device__ bool is_valid() const { return m_magic_value == 42; } }; class CustomOutputT { bool *m_d_flag{}; int m_expected{}; public: __host__ __device__ CustomOutputT(bool *d_flag, int expected) : m_d_flag(d_flag) , m_expected(expected) {} __host__ __device__ void operator=(const CustomAccumulatorT &accum) const { *m_d_flag = accum.is_valid() && (accum.get() == m_expected); } }; __global__ void 
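// (Illustrative sketch, not part of this test.)  TestByGenMode above also
// checks iterator-valued input by summing a cub::ConstantInputIterator, whose
// every dereference yields the same value, so the expected result is simply
// num_items * val.  A minimal standalone version; ExampleSumConstant is a
// hypothetical name:
void ExampleSumConstant(int val, int num_items, int *d_out)
{
    cub::ConstantInputIterator<int> d_in(val);

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                           d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                           d_in, d_out, num_items);
    cudaFree(d_temp_storage);
}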
InitializeTestAccumulatorTypes(int num_items, int expected, bool *d_flag, CustomInputT *d_in, CustomOutputT *d_out) { const int idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (idx < num_items) { d_in[idx] = CustomInputT(1); } if (idx == 0) { *d_out = CustomOutputT{d_flag, expected}; } } void TestAccumulatorTypes() { const int num_items = 2 * 1024 * 1024; const int expected = num_items; const int block_size = 256; const int grid_size = (num_items + block_size - 1) / block_size; CustomInputT *d_in{}; CustomOutputT *d_out{}; CustomAccumulatorT init{}; bool *d_flag{}; CubDebugExit( g_allocator.DeviceAllocate((void **)&d_out, sizeof(CustomOutputT))); CubDebugExit(g_allocator.DeviceAllocate((void **)&d_flag, sizeof(bool))); CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in, sizeof(CustomInputT) * num_items)); InitializeTestAccumulatorTypes<<>>(num_items, expected, d_flag, d_in, d_out); std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum{}, init)); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); CubDebugExit(cudaMemset(d_temp_storage, 1, temp_storage_bytes)); CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum{}, init)); bool ok{}; CubDebugExit(cudaMemcpy(&ok, d_flag, sizeof(bool), cudaMemcpyDeviceToHost)); AssertTrue(ok); CubDebugExit(g_allocator.DeviceFree(d_out)); CubDebugExit(g_allocator.DeviceFree(d_in)); } template struct GetTileSize { OffsetT max_items{}; OffsetT max_segments{}; OffsetT tile_size{}; GetTileSize(OffsetT max_items, OffsetT max_segments) : max_items(max_items) , max_segments(max_segments) {} template CUB_RUNTIME_FUNCTION cudaError_t Invoke() { this->tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD; return cudaSuccess; } }; /// Test problem type template void TestType(OffsetT max_items, OffsetT max_segments) { // Inspect the tuning policies to determine this arch's tile size: using MaxPolicyT = typename DeviceReducePolicy::MaxPolicy; GetTileSize dispatch(max_items, max_segments); CubDebugExit(MaxPolicyT::Invoke(g_ptx_version, dispatch)); TestBySize(max_items, max_segments, dispatch.tile_size); } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { typedef int OffsetT; OffsetT max_items = 27000000; OffsetT max_segments = 34000; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); g_verbose_input = args.CheckCmdLineFlag("v2"); args.GetCmdLineArgument("n", max_items); args.GetCmdLineArgument("s", max_segments); args.GetCmdLineArgument("i", g_timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--s= " "[--i= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); g_device_giga_bandwidth = args.device_giga_bandwidth; // Get ptx version CubDebugExit(PtxVersion(g_ptx_version)); // Get SM count g_sm_count = args.deviceProp.multiProcessorCount; // %PARAM% TEST_CDP cdp 0:1 // %PARAM% TEST_TYPES types 0:1:2:3 #if TEST_TYPES == 0 TestType(max_items, max_segments); TestType(max_items, max_segments); TestType(max_items, max_segments); #elif TEST_TYPES == 1 TestType(max_items, max_segments); TestType(max_items, 
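// (Illustrative sketch, not part of this test.)  TestAccumulatorTypes above
// verifies that DeviceReduce::Reduce accumulates in the type produced by the
// reduction operator (CustomAccumulatorT) rather than in the input or output
// type.  A simpler analogue under the same assumption: summing bytes with an
// int initial value accumulates in int, avoiding 8-bit overflow.
// ExampleByteSum is a hypothetical name:
void ExampleByteSum(const unsigned char *d_in, int num_items, int *d_out)
{
    const int init = 0;

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                              d_in, d_out, num_items, cub::Sum(), init);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                              d_in, d_out, num_items, cub::Sum(), init);
    cudaFree(d_temp_storage);
}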
max_segments); TestType(max_items, max_segments); TestType(max_items, max_segments); #elif TEST_TYPES == 2 TestType(max_items, max_segments); TestType(max_items, max_segments); TestType(max_items, max_segments); TestType(max_items, max_segments); #else // TEST_TYPES == 3 TestType(max_items, max_segments); TestType(max_items, max_segments); TestAccumulatorTypes(); #endif printf("\n"); return 0; } cub-2.0.1/test/test_device_reduce_by_key.cu000066400000000000000000000616711434614775400210140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of DeviceReduce::ReduceByKey utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include "test_util.h" #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_timing_iterations = 0; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; //--------------------------------------------------------------------- // Dispatch to different CUB entrypoints //--------------------------------------------------------------------- /** * Dispatch to reduce-by-key entrypoint */ template < typename KeyInputIteratorT, typename KeyOutputIteratorT, typename ValueInputIteratorT, typename ValueOutputIteratorT, typename NumRunsIteratorT, typename EqualityOpT, typename ReductionOpT, typename OffsetT> CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, ValueInputIteratorT d_values_in, ValueOutputIteratorT d_values_out, NumRunsIteratorT d_num_runs, EqualityOpT /*equality_op*/, ReductionOpT reduction_op, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceReduce::ReduceByKey( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, reduction_op, num_items); } return error; } //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceSelect */ template __global__ void CDPDispatchKernel(Int2Type cub_backend, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, ValueInputIteratorT d_values_in, ValueOutputIteratorT d_values_out, NumRunsIteratorT d_num_runs, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items) { *d_cdp_error = Dispatch(cub_backend, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items); *d_temp_storage_bytes = temp_storage_bytes; } /** * Dispatch to CDP kernel */ template __forceinline__ cudaError_t Dispatch(Int2Type /*dispatch_to*/, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, ValueInputIteratorT d_values_in, ValueOutputIteratorT d_values_out, NumRunsIteratorT d_num_runs, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items) { // Invoke kernel to invoke device-side dispatch cudaError_t retval = 
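// (Illustrative sketch, not part of this test.)  The dispatcher above forwards
// to cub::DeviceReduce::ReduceByKey, which reduces *consecutive* runs of equal
// keys; keys only need to match within a run, not be globally unique.  For
// example, with cub::Sum:
//
//   keys_in    = {1, 1, 2, 1}    values_in      = {10, 20, 30, 40}
//   unique_out = {1, 2, 1}       aggregates_out = {30, 30, 40}      num_runs = 3
//
// A minimal standalone call; ExampleReduceByKey is a hypothetical name:
void ExampleReduceByKey(const int *d_keys_in,
                        const int *d_values_in,
                        int        num_items,
                        int       *d_unique_out,
                        int       *d_aggregates_out,
                        int       *d_num_runs_out)
{
    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                                   d_keys_in, d_unique_out,
                                   d_values_in, d_aggregates_out,
                                   d_num_runs_out, cub::Sum(), num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                                   d_keys_in, d_unique_out,
                                   d_values_in, d_aggregates_out,
                                   d_num_runs_out, cub::Sum(), num_items);
    cudaFree(d_temp_storage);
}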
thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, Int2Type{}, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items); CubDebugExit(retval); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } #endif // TEST_CDP //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ template void Initialize( int entropy_reduction, T *h_in, int num_items, int max_segment) { unsigned int max_int = (unsigned int) -1; int key = 0; int i = 0; while (i < num_items) { // Select number of repeating occurrences int repeat; if (max_segment < 0) { repeat = num_items; } else if (max_segment < 2) { repeat = 1; } else { RandomBits(repeat, entropy_reduction); repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int)); repeat = CUB_MAX(1, repeat); } int j = i; while (j < CUB_MIN(i + repeat, num_items)) { InitValue(INTEGER_SEED, h_in[j], key); j++; } i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve problem. Returns total number of segments identified */ template < typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyT, typename ValueT, typename EqualityOpT, typename ReductionOpT> int Solve( KeyInputIteratorT h_keys_in, KeyT *h_keys_reference, ValueInputIteratorT h_values_in, ValueT *h_values_reference, EqualityOpT equality_op, ReductionOpT reduction_op, int num_items) { using AccumT = cub::detail::accumulator_t; // First item KeyT previous = h_keys_in[0]; AccumT aggregate = h_values_in[0]; int num_segments = 0; // Subsequent items for (int i = 1; i < num_items; ++i) { if (!equality_op(previous, h_keys_in[i])) { h_keys_reference[num_segments] = previous; h_values_reference[num_segments] = static_cast(aggregate); num_segments++; aggregate = h_values_in[i]; } else { aggregate = static_cast(reduction_op(aggregate, h_values_in[i])); } previous = h_keys_in[i]; } h_keys_reference[num_segments] = previous; h_values_reference[num_segments] = static_cast(aggregate); num_segments++; return num_segments; } /** * Test DeviceSelect for a given problem input */ template < Backend BACKEND, typename DeviceKeyInputIteratorT, typename DeviceValueInputIteratorT, typename KeyT, typename ValueT, typename EqualityOpT, typename ReductionOpT> void Test( DeviceKeyInputIteratorT d_keys_in, DeviceValueInputIteratorT d_values_in, KeyT* h_keys_reference, ValueT* h_values_reference, EqualityOpT equality_op, ReductionOpT reduction_op, int num_segments, int num_items) { // Allocate device output arrays and number of segments KeyT* d_keys_out = NULL; ValueT* d_values_out = NULL; int* d_num_runs = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int))); // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, 
sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Clear device output arrays CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items)); CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items)); CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int))); // Run warmup/correctness iteration CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items)); // Check for correctness (and display results, if specified) int compare1 = CompareDeviceResults(h_keys_reference, d_keys_out, num_segments, true, g_verbose); printf("\t Keys %s ", compare1 ? "FAIL" : "PASS"); int compare2 = CompareDeviceResults(h_values_reference, d_values_out, num_segments, true, g_verbose); printf("\t Values %s ", compare2 ? "FAIL" : "PASS"); int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose); printf("\t Count %s ", compare3 ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Performance GpuTimer gpu_timer; gpu_timer.Start(); CubDebugExit(Dispatch(Int2Type(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items)); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; int bytes_moved = ((num_items + num_segments) * sizeof(KeyT)) + ((num_items + num_segments) * sizeof(ValueT)); float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f; printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth); } printf("\n\n"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Cleanup if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out)); if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out)); if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs)); if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, compare1 | compare2 | compare3); } /** * Test DeviceSelect on pointer type */ template < Backend BACKEND, typename KeyT, typename ValueT, typename ReductionOpT> void TestPointer( int num_items, int entropy_reduction, int max_segment, ReductionOpT reduction_op) { // Allocate host arrays KeyT* h_keys_in = new KeyT[num_items]; KeyT* h_keys_reference = new KeyT[num_items]; ValueT* h_values_in = new ValueT[num_items]; ValueT* h_values_reference = new ValueT[num_items]; for (int i = 0; i < num_items; ++i) InitValue(INTEGER_SEED, h_values_in[i], 1); // Initialize problem and solution Equality equality_op; Initialize(entropy_reduction, h_keys_in, num_items, 
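// (Illustrative sketch, not part of this test.)  The bandwidth figure printed
// above models ReduceByKey as reading num_items keys plus num_items values and
// writing num_segments unique keys plus num_segments aggregates.  A small
// helper expressing the same model; ExampleBytesMoved is a hypothetical name:
inline size_t ExampleBytesMoved(int num_items, int num_segments,
                                size_t key_bytes, size_t value_bytes)
{
    return size_t(num_items + num_segments) * key_bytes +
           size_t(num_items + num_segments) * value_bytes;
}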
max_segment); int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items); printf("\nPointer %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n", (BACKEND == CDP) ? "CDP CUB" : "CUB", (std::is_same::value) ? "Sum" : "Max", num_items, num_segments, float(num_items) / num_segments, typeid(KeyT).name(), typeid(ValueT).name(), max_segment, entropy_reduction); fflush(stdout); // Allocate problem device arrays KeyT *d_keys_in = NULL; ValueT *d_values_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice)); // Run Test Test(d_keys_in, d_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items); // Cleanup if (h_keys_in) delete[] h_keys_in; if (h_values_in) delete[] h_values_in; if (h_keys_reference) delete[] h_keys_reference; if (h_values_reference) delete[] h_values_reference; if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in)); if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in)); } /** * Test on iterator type */ template < Backend BACKEND, typename KeyT, typename ValueT, typename ReductionOpT> void TestIterator( int num_items, int entropy_reduction, int max_segment, ReductionOpT reduction_op) { // Allocate host arrays KeyT* h_keys_in = new KeyT[num_items]; KeyT* h_keys_reference = new KeyT[num_items]; ValueT one_val; InitValue(INTEGER_SEED, one_val, 1); ConstantInputIterator h_values_in(one_val); ValueT* h_values_reference = new ValueT[num_items]; // Initialize problem and solution Equality equality_op; Initialize(entropy_reduction, h_keys_in, num_items, max_segment); int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items); printf("\nIterator %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n", (BACKEND == CDP) ? "CDP CUB" : "CUB", (std::is_same::value) ? 
"Sum" : "Max", num_items, num_segments, float(num_items) / num_segments, typeid(KeyT).name(), typeid(ValueT).name(), max_segment, entropy_reduction); fflush(stdout); // Allocate problem device arrays KeyT *d_keys_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); // Run Test Test(d_keys_in, h_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items); // Cleanup if (h_keys_in) delete[] h_keys_in; if (h_keys_reference) delete[] h_keys_reference; if (h_values_reference) delete[] h_values_reference; if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in)); } /** * Test different gen modes */ template < Backend BACKEND, typename KeyT, typename ValueT, typename ReductionOpT> void Test( int num_items, ReductionOpT reduction_op, int max_segment) { // 0 key-bit entropy reduction rounds TestPointer(num_items, 0, max_segment, reduction_op); if (max_segment > 1) { // 2 key-bit entropy reduction rounds TestPointer(num_items, 2, max_segment, reduction_op); // 7 key-bit entropy reduction rounds TestPointer(num_items, 7, max_segment, reduction_op); } } /** * Test different avg segment lengths modes */ template < Backend BACKEND, typename KeyT, typename ValueT, typename ReductionOpT> void Test( int num_items, ReductionOpT reduction_op) { Test(num_items, reduction_op, -1); Test(num_items, reduction_op, 1); // Evaluate different max-segment lengths for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 11) { Test(num_items, reduction_op, max_segment); } } /** * Test different dispatch */ template < typename KeyT, typename ValueT, typename ReductionOpT> void TestDispatch( int num_items, ReductionOpT reduction_op) { #if TEST_CDP == 0 Test(num_items, reduction_op); #elif TEST_CDP == 1 Test(num_items, reduction_op); #endif // TEST_CDP } /** * Test different input sizes */ template < typename KeyT, typename ValueT, typename ReductionOpT> void TestSize( int num_items, ReductionOpT reduction_op) { if (num_items < 0) { TestDispatch(1, reduction_op); TestDispatch(100, reduction_op); TestDispatch(10000, reduction_op); TestDispatch(1000000, reduction_op); } else { TestDispatch(num_items, reduction_op); } } template < typename KeyT, typename ValueT> void TestOp( int num_items) { TestSize(num_items, cub::Sum()); TestSize(num_items, cub::Max()); } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = -1; int entropy_reduction = 0; int maxseg = 1000; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("maxseg", maxseg); args.GetCmdLineArgument("entropy", entropy_reduction); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--i= " "[--device=] " "[--maxseg=]" "[--entropy=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); printf("\n"); // Get ptx version int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); // %PARAM% TEST_CDP cdp 0:1 // Test different input types TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); 
TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); return 0; } cub-2.0.1/test/test_device_run_length_encode.cu000066400000000000000000000630711434614775400216610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of DeviceRunLengthEncode utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include "test_util.h" #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_timing_iterations = 0; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; // Operation types enum RleMethod { RLE, // Run length encode NON_TRIVIAL, }; //--------------------------------------------------------------------- // Dispatch to different CUB entrypoints //--------------------------------------------------------------------- /** * Dispatch to run-length encode entrypoint */ template < typename InputIteratorT, typename UniqueOutputIteratorT, typename OffsetsOutputIteratorT, typename LengthsOutputIteratorT, typename NumRunsIterator, typename OffsetT> CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*method*/, Int2Type /*dispatch_to*/, int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, UniqueOutputIteratorT d_unique_out, OffsetsOutputIteratorT /*d_offsets_out*/, LengthsOutputIteratorT d_lengths_out, NumRunsIterator d_num_runs, cub::Equality /*equality_op*/, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceRunLengthEncode::Encode( d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_lengths_out, d_num_runs, num_items); } return error; } /** * Dispatch to non-trivial runs entrypoint */ template < typename InputIteratorT, typename UniqueOutputIteratorT, typename OffsetsOutputIteratorT, typename LengthsOutputIteratorT, typename NumRunsIterator, typename OffsetT> CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*method*/, Int2Type /*dispatch_to*/, int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, UniqueOutputIteratorT /*d_unique_out*/, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsIterator d_num_runs, cub::Equality /*equality_op*/, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceRunLengthEncode::NonTrivialRuns( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs, num_items); } return error; } //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceRunLengthEncode */ template __global__ void CDPDispatchKernel(Int2Type method, Int2Type cub_backend, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, InputIteratorT d_in, UniqueOutputIteratorT d_unique_out, OffsetsOutputIteratorT 
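// (Illustrative sketch, not part of this test.)  The dispatcher above forwards
// to cub::DeviceRunLengthEncode::Encode, which compacts consecutive runs of
// equal items into (unique value, run length) pairs.  For example:
//
//   in         = {1, 1, 2, 3, 3, 3}
//   unique_out = {1, 2, 3}    counts_out = {2, 1, 3}    num_runs = 3
//
// A minimal standalone call; ExampleEncode is a hypothetical name:
void ExampleEncode(const int *d_in,
                   int        num_items,
                   int       *d_unique_out,
                   int       *d_counts_out,
                   int       *d_num_runs_out)
{
    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
                                       d_in, d_unique_out, d_counts_out,
                                       d_num_runs_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
                                       d_in, d_unique_out, d_counts_out,
                                       d_num_runs_out, num_items);
    cudaFree(d_temp_storage);
}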
d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsIterator d_num_runs, cub::Equality equality_op, OffsetT num_items) { *d_cdp_error = Dispatch(method, cub_backend, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items); *d_temp_storage_bytes = temp_storage_bytes; } /** * Dispatch to CDP kernel */ template __forceinline__ cudaError_t Dispatch(Int2Type method, Int2Type /*dispatch_to*/, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, UniqueOutputIteratorT d_unique_out, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsIterator d_num_runs, EqualityOp equality_op, OffsetT num_items) { // Invoke kernel to invoke device-side dispatch cudaError_t retval = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, method, Int2Type{}, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items); CubDebugExit(retval); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } #endif // TEST_CDP //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ template void Initialize( int entropy_reduction, T *h_in, int num_items, int max_segment) { unsigned int max_int = (unsigned int) -1; int key = 0; int i = 0; while (i < num_items) { // Select number of repeating occurrences for the current run int repeat; if (max_segment < 0) { repeat = num_items; } else if (max_segment < 2) { repeat = 1; } else { RandomBits(repeat, entropy_reduction); repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int)); repeat = CUB_MAX(1, repeat); } int j = i; while (j < CUB_MIN(i + repeat, num_items)) { InitValue(INTEGER_SEED, h_in[j], key); j++; } i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve problem. 
Returns total number of segments identified */ template < RleMethod RLE_METHOD, typename InputIteratorT, typename T, typename OffsetT, typename LengthT, typename EqualityOp> int Solve( InputIteratorT h_in, T *h_unique_reference, OffsetT *h_offsets_reference, LengthT *h_lengths_reference, EqualityOp equality_op, int num_items) { if (num_items == 0) return 0; // First item T previous = h_in[0]; LengthT length = 1; int num_runs = 0; int run_begin = 0; // Subsequent items for (int i = 1; i < num_items; ++i) { if (!equality_op(previous, h_in[i])) { if ((RLE_METHOD != NON_TRIVIAL) || (length > 1)) { h_unique_reference[num_runs] = previous; h_offsets_reference[num_runs] = run_begin; h_lengths_reference[num_runs] = length; num_runs++; } length = 1; run_begin = i; } else { length++; } previous = h_in[i]; } if ((RLE_METHOD != NON_TRIVIAL) || (length > 1)) { h_unique_reference[num_runs] = previous; h_offsets_reference[num_runs] = run_begin; h_lengths_reference[num_runs] = length; num_runs++; } return num_runs; } /** * Test DeviceRunLengthEncode for a given problem input */ template < RleMethod RLE_METHOD, Backend BACKEND, typename DeviceInputIteratorT, typename T, typename OffsetT, typename LengthT, typename EqualityOp> void Test( DeviceInputIteratorT d_in, T *h_unique_reference, OffsetT *h_offsets_reference, LengthT *h_lengths_reference, EqualityOp equality_op, int num_runs, int num_items) { // Allocate device output arrays and number of segments T* d_unique_out = NULL; LengthT* d_offsets_out = NULL; OffsetT* d_lengths_out = NULL; int* d_num_runs = NULL; if (RLE_METHOD == RLE) CubDebugExit(g_allocator.DeviceAllocate((void**)&d_unique_out, sizeof(T) * num_items)); if (RLE_METHOD == NON_TRIVIAL) CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offsets_out, sizeof(OffsetT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(LengthT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int))); // Allocate CDP device arrays size_t* d_temp_storage_bytes = NULL; cudaError_t* d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch(Int2Type(), Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Clear device output arrays if (RLE_METHOD == RLE) CubDebugExit(cudaMemset(d_unique_out, 0, sizeof(T) * num_items)); if (RLE_METHOD == NON_TRIVIAL) CubDebugExit(cudaMemset(d_offsets_out, 0, sizeof(OffsetT) * num_items)); CubDebugExit(cudaMemset(d_lengths_out, 0, sizeof(LengthT) * num_items)); CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int))); // Run warmup/correctness iteration CubDebugExit(Dispatch(Int2Type(), Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items)); // Check for correctness (and display results, if specified) int compare0 = 0; int compare1 = 0; int compare2 = 0; int compare3 = 0; if (RLE_METHOD == RLE) { compare0 = CompareDeviceResults(h_unique_reference, d_unique_out, num_runs, true, g_verbose); printf("\t Keys %s\n", compare0 ? 
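// (Illustrative sketch, not part of this test.)  The NON_TRIVIAL path of Solve
// above keeps only runs longer than one item, matching
// cub::DeviceRunLengthEncode::NonTrivialRuns, which reports the starting
// offset and length of each such run.  For example:
//
//   in          = {1, 1, 2, 3, 3, 3}
//   offsets_out = {0, 3}    lengths_out = {2, 3}    num_runs = 2
//
// A minimal standalone call; ExampleNonTrivialRuns is a hypothetical name:
void ExampleNonTrivialRuns(const int *d_in,
                           int        num_items,
                           int       *d_offsets_out,
                           int       *d_lengths_out,
                           int       *d_num_runs_out)
{
    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
                                               d_in, d_offsets_out, d_lengths_out,
                                               d_num_runs_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
                                               d_in, d_offsets_out, d_lengths_out,
                                               d_num_runs_out, num_items);
    cudaFree(d_temp_storage);
}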
"FAIL" : "PASS"); } if (RLE_METHOD != RLE) { compare1 = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose); printf("\t Offsets %s\n", compare1 ? "FAIL" : "PASS"); } compare2 = CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose); printf("\t Lengths %s\n", compare2 ? "FAIL" : "PASS"); compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose); printf("\t Count %s\n", compare3 ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Performance GpuTimer gpu_timer; gpu_timer.Start(); CubDebugExit(Dispatch(Int2Type(), Int2Type(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items)); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; int bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT))); float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f; printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth); } printf("\n\n"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Cleanup if (d_unique_out) CubDebugExit(g_allocator.DeviceFree(d_unique_out)); if (d_offsets_out) CubDebugExit(g_allocator.DeviceFree(d_offsets_out)); if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out)); if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs)); if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, compare0 | compare1 | compare2 | compare3); } /** * Test DeviceRunLengthEncode on pointer type */ template < RleMethod RLE_METHOD, Backend BACKEND, typename T, typename OffsetT, typename LengthT> void TestPointer( int num_items, int entropy_reduction, int max_segment) { // Allocate host arrays T* h_in = new T[num_items]; T* h_unique_reference = new T[num_items]; OffsetT* h_offsets_reference = new OffsetT[num_items]; LengthT* h_lengths_reference = new LengthT[num_items]; for (int i = 0; i < num_items; ++i) InitValue(INTEGER_SEED, h_offsets_reference[i], 1); // Initialize problem and solution Equality equality_op; Initialize(entropy_reduction, h_in, num_items, max_segment); int num_runs = Solve(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items); printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n", (RLE_METHOD == RLE) ? "DeviceRunLengthEncode::Encode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other", (BACKEND == CDP) ? 
"CDP CUB" : "CUB", num_items, num_runs, float(num_items) / num_runs, typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(), max_segment, entropy_reduction); fflush(stdout); // Allocate problem device arrays T* d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice)); // Run Test Test(d_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items); // Cleanup if (h_in) delete[] h_in; if (h_unique_reference) delete[] h_unique_reference; if (h_offsets_reference) delete[] h_offsets_reference; if (h_lengths_reference) delete[] h_lengths_reference; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); } /** * Test on iterator type */ template < RleMethod RLE_METHOD, Backend BACKEND, typename T, typename OffsetT, typename LengthT> void TestIterator( int num_items, Int2Type /*is_primitive*/) { // Allocate host arrays T* h_unique_reference = new T[num_items]; OffsetT* h_offsets_reference = new OffsetT[num_items]; LengthT* h_lengths_reference = new LengthT[num_items]; T one_val; InitValue(INTEGER_SEED, one_val, 1); ConstantInputIterator h_in(one_val); // Initialize problem and solution Equality equality_op; int num_runs = Solve(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items); printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n", (RLE_METHOD == RLE) ? "DeviceRunLengthEncode::Encode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other", (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_runs, float(num_items) / num_runs, typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name()); fflush(stdout); // Run Test Test(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items); // Cleanup if (h_unique_reference) delete[] h_unique_reference; if (h_offsets_reference) delete[] h_offsets_reference; if (h_lengths_reference) delete[] h_lengths_reference; } template < RleMethod RLE_METHOD, Backend BACKEND, typename T, typename OffsetT, typename LengthT> void TestIterator( int /*num_items*/, Int2Type /*is_primitive*/) {} /** * Test different gen modes */ template void Test(int num_items) { // Test iterator (one run) TestIterator( num_items, Int2Type::PRIMITIVE>()); // Evaluate different run lengths / segment sizes const int max_seg_limit = CUB_MIN(num_items, 1 << 16); const int max_seg_inc = 4; for (int max_segment = 1, entropy_reduction = 0; max_segment <= max_seg_limit; max_segment <<= max_seg_inc, entropy_reduction++) { const int max_seg = CUB_MAX(1, max_segment); TestPointer(num_items, entropy_reduction, max_seg); } } /** * Test different dispatch */ template < typename T, typename OffsetT, typename LengthT> void TestDispatch( int num_items) { #if TEST_CDP == 0 Test(num_items); Test(num_items); #elif TEST_CDP == 1 Test(num_items); Test(num_items); #endif } /** * Test different input sizes */ template < typename T, typename OffsetT, typename LengthT> void TestSize( int num_items) { if (num_items < 0) { TestDispatch(0); TestDispatch(1); TestDispatch(100); TestDispatch(10000); TestDispatch(1000000); } else { TestDispatch(num_items); } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int 
num_items = -1; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--i= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); printf("\n"); // %PARAM% TEST_CDP cdp 0:1 // Test different input types TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); return 0; } cub-2.0.1/test/test_device_scan.cu000066400000000000000000001174601434614775400171250ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of DeviceScan utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include "test_util.h" #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_timing_iterations = 0; double g_device_giga_bandwidth; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; /** * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants) */ template struct WrapperFunctor { OpT op; WrapperFunctor(OpT op) : op(op) {} template __host__ __device__ __forceinline__ auto operator()(const T &a, const U &b) const -> decltype(op(a, b)) { return static_cast(op(a, b)); } }; //--------------------------------------------------------------------- // Dispatch to different CUB DeviceScan entrypoints //--------------------------------------------------------------------- /** * Dispatch to exclusive scan entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type /*in_place*/, Int2Type /*dispatch_to*/, IsPrimitiveT /*is_primitive*/, int timing_timing_iterations, size_t * /* d_temp_storage_bytes */, cudaError_t * /* d_cdp_error */, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT /* d_out */, ScanOpT scan_op, InitialValueT initial_value, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, scan_op, initial_value, num_items); } return error; } template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type /*in_place*/, Int2Type /*dispatch_to*/, IsPrimitiveT /*is_primitive*/, int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitialValueT initial_value, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items); } return error; } /** * Dispatch to exclusive sum entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type /*in_place*/, Int2Type /*dispatch_to*/, Int2Type /*is_primitive*/, int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT /* d_out */, Sum /*scan_op*/, InitialValueT /*initial_value*/, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, num_items); } return error; } template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type /*in_place*/, Int2Type /*dispatch_to*/, Int2Type /*is_primitive*/, int timing_timing_iterations, size_t * 
/*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, Sum /*scan_op*/, InitialValueT /*initial_value*/, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } /** * Dispatch to inclusive scan entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type /*in_place*/, Int2Type /*dispatch_to*/, IsPrimitiveT /*is_primitive*/, int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT /* d_out */, ScanOpT scan_op, NullType /* initial_value */, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, scan_op, num_items); } return error; } template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type /*in_place*/, Int2Type /*dispatch_to*/, IsPrimitiveT /*is_primitive*/, int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, NullType /*initial_value*/, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items); } return error; } /** * Dispatch to inclusive sum entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type /*in_place*/, Int2Type /*dispatch_to*/, Int2Type /*is_primitive*/, int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT /* d_out */, Sum /*scan_op*/, NullType /*initial_value*/, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, num_items); } return error; } template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type /*in_place*/, Int2Type /*dispatch_to*/, Int2Type /*is_primitive*/, int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, Sum /*scan_op*/, NullType /*initial_value*/, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceScan */ template __global__ void CDPDispatchKernel(InPlaceT in_place, CubBackendT cub_backend, IsPrimitiveT is_primitive, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitialValueT initial_value, OffsetT num_items) { *d_cdp_error 
= Dispatch(in_place, cub_backend, is_primitive, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items); *d_temp_storage_bytes = temp_storage_bytes; } /** * Dispatch to CDP kernel */ template cudaError_t Dispatch(InPlaceT in_place, Int2Type dispatch_to, IsPrimitiveT is_primitive, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitialValueT initial_value, OffsetT num_items) { // Invoke kernel to invoke device-side dispatch to CUB backend: (void)dispatch_to; using CubBackendT = Int2Type; CubBackendT cub_backend; cudaError_t retval = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, in_place, cub_backend, is_primitive, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items); CubDebugExit(retval); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } #endif // TEST_CDP //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ template void Initialize( GenMode gen_mode, T *h_in, int num_items) { for (int i = 0; i < num_items; ++i) { InitValue(gen_mode, h_in[i], i); } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve exclusive-scan problem */ template < typename InputIteratorT, typename OutputT, typename ScanOpT, typename InitialValueT> void Solve( InputIteratorT h_in, OutputT *h_reference, int num_items, ScanOpT scan_op, InitialValueT initial_value) { using AccumT = cub::detail::accumulator_t< ScanOpT, InitialValueT, cub::detail::value_t>; if (num_items > 0) { AccumT val = static_cast(h_in[0]); h_reference[0] = initial_value; AccumT inclusive = static_cast(scan_op(initial_value, val)); for (int i = 1; i < num_items; ++i) { val = static_cast(h_in[i]); h_reference[i] = static_cast(inclusive); inclusive = static_cast(scan_op(inclusive, val)); } } } /** * Solve inclusive-scan problem */ template < typename InputIteratorT, typename OutputT, typename ScanOpT> void Solve( InputIteratorT h_in, OutputT *h_reference, int num_items, ScanOpT scan_op, NullType) { using AccumT = cub::detail::accumulator_t< ScanOpT, cub::detail::value_t, cub::detail::value_t>; if (num_items > 0) { AccumT inclusive = h_in[0]; h_reference[0] = static_cast(inclusive); for (int i = 1; i < num_items; ++i) { AccumT val = h_in[i]; inclusive = static_cast(scan_op(inclusive, val)); h_reference[i] = static_cast(inclusive); } } } template struct AllocateOutput { static void run(OutputT *&d_out, DeviceInputIteratorT, int num_items) { CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items)); } }; template struct AllocateOutput { static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */) { d_out = d_in; } }; /** * Test DeviceScan for a given problem input */ template < Backend BACKEND, typename DeviceInputIteratorT, typename OutputT, typename ScanOpT, typename InitialValueT, bool InPlace=false> void Test( DeviceInputIteratorT d_in, OutputT 
*h_reference, int num_items, ScanOpT scan_op, InitialValueT initial_value) { using InputT = cub::detail::value_t; // Allocate device output array OutputT *d_out = NULL; AllocateOutput::run(d_out, d_in, num_items); // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch( Int2Type(), Int2Type(), Int2Type::PRIMITIVE>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); if (!InPlace) { // Clear device output array CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items)); } // Run warmup/correctness iteration CubDebugExit(Dispatch( Int2Type(), Int2Type(), Int2Type::PRIMITIVE>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); printf("\t%s", compare ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Performance if (g_timing_iterations > 0) { GpuTimer gpu_timer; gpu_timer.Start(); CubDebugExit(Dispatch(Int2Type(), Int2Type(), Int2Type::PRIMITIVE>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items)); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% " "peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); } printf("\n\n"); // Cleanup if (!InPlace) { if (d_out) { CubDebugExit(g_allocator.DeviceFree(d_out)); } } if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, compare); } template __global__ void FillInitValue(InitialValueT *ptr, InitialValueT initial_value) { *ptr = initial_value; } template < Backend BACKEND, typename DeviceInputIteratorT, typename OutputT, typename ScanOpT, typename InitialValueT> typename std::enable_if::value>::type TestFutureInitValue( DeviceInputIteratorT d_in, OutputT *h_reference, int num_items, ScanOpT scan_op, InitialValueT initial_value) { // Allocate device initial_value InitialValueT *d_initial_value = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_initial_value, sizeof(InitialValueT))); FillInitValue<<<1, 1>>>(d_initial_value, initial_value); // Run test auto future_init_value = cub::FutureValue(d_initial_value); Test(d_in, h_reference, num_items, scan_op, future_init_value); // Cleanup if (d_initial_value) CubDebugExit(g_allocator.DeviceFree(d_initial_value)); } template < Backend BACKEND, typename DeviceInputIteratorT, typename OutputT, typename ScanOpT, typename 
InitialValueT> typename std::enable_if::value>::type TestFutureInitValue( DeviceInputIteratorT, OutputT *, int, ScanOpT, InitialValueT) { // cub::NullType does not have device pointer, so nothing to do here } template < Backend BACKEND, typename DeviceInputIteratorT, typename OutputT, typename ScanOpT, typename InitialValueT> typename std::enable_if::value>::type TestFutureInitValueIter( DeviceInputIteratorT d_in, OutputT *h_reference, int num_items, ScanOpT scan_op, InitialValueT initial_value) { using IterT = cub::ConstantInputIterator; IterT iter(initial_value); auto future_init_value = cub::FutureValue(iter); Test(d_in, h_reference, num_items, scan_op, future_init_value); } template < Backend BACKEND, typename DeviceInputIteratorT, typename OutputT, typename ScanOpT, typename InitialValueT> typename std::enable_if::value>::type TestFutureInitValueIter( DeviceInputIteratorT, OutputT *, int, ScanOpT, InitialValueT) { // cub::NullType does not have device pointer, so nothing to do here } template void TestInplace(OutputT *d_in, OutputT *h_reference, int num_items, ScanOpT scan_op, InitialValueT initial_value) { Test(d_in, h_reference, num_items, scan_op, initial_value); } template void TestInplace(DeviceInputIteratorT, OutputT *, int, ScanOpT, InitialValueT) {} /** * Test DeviceScan on pointer type */ template < Backend BACKEND, typename InputT, typename OutputT, typename ScanOpT, typename InitialValueT> void TestPointer( int num_items, GenMode gen_mode, ScanOpT scan_op, InitialValueT initial_value) { printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n", (BACKEND == CDP) ? "CDP CUB" : "CUB", (std::is_same::value) ? "Inclusive" : "Exclusive", (std::is_same::value) ? "Sum" : "Scan", num_items, typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT), (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS"); fflush(stdout); // Allocate host arrays InputT* h_in = new InputT[num_items]; OutputT* h_reference = new OutputT[num_items]; // Initialize problem and solution Initialize(gen_mode, h_in, num_items); // If the output type is primitive and the operator is cub::Sum, the test // dispatcher throws away scan_op and initial_value for exclusive scan. // Without an initial_value arg, the accumulator switches to the input value // type. // Do the same thing here: if (Traits::PRIMITIVE && std::is_same::value && !std::is_same::value) { Solve(h_in, h_reference, num_items, cub::Sum{}, InputT{}); } else { Solve(h_in, h_reference, num_items, scan_op, initial_value); } // Allocate problem device arrays InputT *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); // Run Test Test(d_in, h_reference, num_items, scan_op, initial_value); TestFutureInitValue(d_in, h_reference, num_items, scan_op, initial_value); TestInplace(d_in, h_reference, num_items, scan_op, initial_value); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); } /** * Test DeviceScan on iterator type */ template < Backend BACKEND, typename InputT, typename OutputT, typename ScanOpT, typename InitialValueT> void TestIterator( int num_items, ScanOpT scan_op, InitialValueT initial_value) { printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n", (BACKEND == CDP) ? 
"CDP CUB" : "CUB", (std::is_same::value) ? "Inclusive" : "Exclusive", (std::is_same::value) ? "Sum" : "Scan", num_items, typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT)); fflush(stdout); // Use a constant iterator as the input InputT val = InputT(); ConstantInputIterator h_in(val); // Allocate host arrays OutputT* h_reference = new OutputT[num_items]; // Initialize problem and solution Solve(h_in, h_reference, num_items, scan_op, initial_value); // Run Test Test(h_in, h_reference, num_items, scan_op, initial_value); TestFutureInitValueIter(h_in, h_reference, num_items, scan_op, initial_value); // Cleanup if (h_reference) delete[] h_reference; } /** * Test different gen modes */ template < Backend BACKEND, typename InputT, typename OutputT, typename ScanOpT, typename InitialValueT> void Test( int num_items, ScanOpT scan_op, InitialValueT initial_value) { TestPointer( num_items, UNIFORM, scan_op, initial_value); TestPointer( num_items, RANDOM, scan_op, initial_value); TestIterator( num_items, scan_op, initial_value); } /** * Test different dispatch */ template < typename InputT, typename OutputT, typename ScanOpT, typename InitialValueT> void Test( int num_items, ScanOpT scan_op, InitialValueT initial_value) { #if TEST_CDP == 0 Test(num_items, scan_op, initial_value); #elif TEST_CDP == 1 Test(num_items, scan_op, initial_value); #endif // TEST_CDP } /** * Test different operators */ template void TestOp( int num_items, OutputT identity, OutputT initial_value) { // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values) Test(num_items, cub::Sum(), identity); Test(num_items, cub::Max(), identity); // Exclusive (non-specialized, so we can test initial-value) Test(num_items, WrapperFunctor(cub::Sum()), initial_value); Test(num_items, WrapperFunctor(cub::Max()), initial_value); // Inclusive (no initial value) Test(num_items, cub::Sum(), NullType()); Test(num_items, cub::Max(), NullType()); } /** * Test different input sizes */ template < typename InputT, typename OutputT> void TestSize( int num_items, OutputT identity, OutputT initial_value) { if (num_items < 0) { TestOp(0, identity, initial_value); TestOp(1, identity, initial_value); TestOp(100, identity, initial_value); TestOp(10000, identity, initial_value); TestOp(1000000, identity, initial_value); } else { TestOp(num_items, identity, initial_value); } } class CustomInputT { char m_val{}; public: __host__ __device__ explicit CustomInputT(char val) : m_val(val) {} __host__ __device__ int get() const { return static_cast(m_val); } }; class CustomAccumulatorT { int m_val{0}; int m_magic_value{42}; __host__ __device__ CustomAccumulatorT(int val) : m_val(val) {} public: __host__ __device__ CustomAccumulatorT() {} __host__ __device__ CustomAccumulatorT(const CustomAccumulatorT &in) : m_val(in.is_valid() * in.get()) , m_magic_value(in.is_valid() * 42) {} __host__ __device__ CustomAccumulatorT(const CustomInputT &in) : m_val(in.get()) , m_magic_value(42) {} __host__ __device__ void operator=(const CustomInputT &in) { if (this->is_valid()) { m_val = in.get(); } } __host__ __device__ void operator=(const CustomAccumulatorT &in) { if (this->is_valid() && in.is_valid()) { m_val = in.get(); } } __host__ __device__ CustomAccumulatorT operator+(const CustomInputT &in) const { const int multiplier = this->is_valid(); return {(m_val + in.get()) * multiplier}; } __host__ __device__ CustomAccumulatorT operator+(const CustomAccumulatorT &in) const { const int 
multiplier = this->is_valid() && in.is_valid(); return {(m_val + in.get()) * multiplier}; } __host__ __device__ int get() const { return m_val; } __host__ __device__ bool is_valid() const { return m_magic_value == 42; } }; class CustomOutputT { int *m_d_ok_count{}; int m_expected{}; public: __host__ __device__ CustomOutputT(int *d_ok_count, int expected) : m_d_ok_count(d_ok_count) , m_expected(expected) {} __device__ void operator=(const CustomAccumulatorT &accum) const { const int ok = accum.is_valid() && (accum.get() == m_expected); atomicAdd(m_d_ok_count, ok); } }; __global__ void InitializeTestAccumulatorTypes(int num_items, int *d_ok_count, CustomInputT *d_in, CustomOutputT *d_out) { const int idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (idx < num_items) { d_in[idx] = CustomInputT(1); d_out[idx] = CustomOutputT{d_ok_count, idx}; } } void TestAccumulatorTypes() { const int num_items = 2 * 1024 * 1024; const int block_size = 256; const int grid_size = (num_items + block_size - 1) / block_size; CustomInputT *d_in{}; CustomOutputT *d_out{}; CustomAccumulatorT init{}; int *d_ok_count{}; CubDebugExit(g_allocator.DeviceAllocate((void **)&d_ok_count, sizeof(int))); CubDebugExit(g_allocator.DeviceAllocate((void **)&d_out, sizeof(CustomOutputT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in, sizeof(CustomInputT) * num_items)); InitializeTestAccumulatorTypes<<>>(num_items, d_ok_count, d_in, d_out); std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit(cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, cub::Sum{}, init, num_items)); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); CubDebugExit(cudaMemset(d_temp_storage, 1, temp_storage_bytes)); CubDebugExit(cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, cub::Sum{}, init, num_items)); int ok{}; CubDebugExit(cudaMemcpy(&ok, d_ok_count, sizeof(int), cudaMemcpyDeviceToHost)); AssertEquals(ok, num_items); CubDebugExit(g_allocator.DeviceFree(d_out)); CubDebugExit(g_allocator.DeviceFree(d_in)); CubDebugExit(g_allocator.DeviceFree(d_ok_count)); } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = -1; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--i= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); // %PARAM% TEST_CDP cdp 0:1 // %PARAM% TEST_VALUE_TYPES types 0:1:2 #if TEST_VALUE_TYPES == 0 // Test different input+output data types TestSize(num_items, (int)0, (int)99); // Test same input+output data types TestSize(num_items, (unsigned char)0, (unsigned char)99); TestSize(num_items, (char)0, (char)99); TestSize(num_items, (unsigned short)0, (unsigned short)99); TestSize(num_items, (unsigned int)0, (unsigned int)99); TestSize(num_items, (unsigned long long)0, (unsigned long long)99); #elif TEST_VALUE_TYPES == 1 TestSize(num_items, make_uchar2(0, 0), make_uchar2(17, 21)); TestSize(num_items, make_char2(0, 0), make_char2(17, 21)); TestSize(num_items, make_ushort2(0, 0), make_ushort2(17, 
21)); TestSize(num_items, make_uint2(0, 0), make_uint2(17, 21)); TestSize(num_items, make_ulonglong2(0, 0), make_ulonglong2(17, 21)); TestSize(num_items, make_uchar4(0, 0, 0, 0), make_uchar4(17, 21, 32, 85)); #elif TEST_VALUE_TYPES == 2 TestSize(num_items, make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85)); TestSize(num_items, make_ushort4(0, 0, 0, 0), make_ushort4(17, 21, 32, 85)); TestSize(num_items, make_uint4(0, 0, 0, 0), make_uint4(17, 21, 32, 85)); TestSize(num_items, make_ulonglong4(0, 0, 0, 0), make_ulonglong4(17, 21, 32, 85)); TestSize(num_items, TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(1ll << 63, 1 << 31, static_cast(1 << 15), static_cast(1 << 7))); TestSize(num_items, TestBar(0, 0), TestBar(1ll << 63, 1 << 31)); TestAccumulatorTypes(); #endif return 0; } cub-2.0.1/test/test_device_scan_by_key.cu000066400000000000000000001071621434614775400204650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of DeviceScan utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include "test_util.h" #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_timing_iterations = 0; double g_device_giga_bandwidth; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; enum AliasMode { AliasNone, // output is allocated AliasKeys, // output is an alias of input keys AliasValues // output is an alias of input values }; /** * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants) */ template struct WrapperFunctor { OpT op; WrapperFunctor(OpT op) : op(op) {} template __host__ __device__ __forceinline__ auto operator()(const T &a, const U &b) const -> decltype(op(a, b)) { return static_cast(op(a, b)); } }; /** * \brief DivideByFiveFunctor (used by TestIterator) */ template struct DivideByFiveFunctor { template __host__ __device__ __forceinline__ OutputT operator()(const T &a) const { return static_cast(a / 5); } }; /** * \brief Mod2Equality (used for non-bool keys to make keys more likely to equal each other) */ struct Mod2Equality { template __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const { return (a % 2) == (b % 2); } }; //--------------------------------------------------------------------- // Dispatch to different CUB DeviceScan entrypoints //--------------------------------------------------------------------- /** * Dispatch to exclusive scan entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, IsPrimitiveT /*is_primitive*/, int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, InitialValueT initial_value, OffsetT num_items, EqualityOpT equality_op) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::ExclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, initial_value, num_items, equality_op); } return error; } /** * Dispatch to exclusive sum entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, Int2Type /*is_primitive*/, int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, Sum /*scan_op*/, InitialValueT /*initial_value*/, OffsetT num_items, EqualityOpT equality_op) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op); } return error; } /** * Dispatch to 
inclusive scan entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, IsPrimitiveT /*is_primitive*/, int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, NullType /*initial_value*/, OffsetT num_items, EqualityOpT equality_op) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::InclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op); } return error; } /** * Dispatch to inclusive sum entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, Int2Type /*is_primitive*/, int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, Sum /*scan_op*/, NullType /*initial_value*/, OffsetT num_items, EqualityOpT equality_op) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op); } return error; } //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceScan */ template __global__ void CDPDispatchKernel(Int2Type cub_backend, IsPrimitiveT is_primitive, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, InitialValueT initial_value, OffsetT num_items, EqualityOpT equality_op) { *d_cdp_error = Dispatch(cub_backend, is_primitive, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, initial_value, num_items, equality_op); *d_temp_storage_bytes = temp_storage_bytes; } /** * Dispatch to CDP kernel */ template cudaError_t Dispatch(Int2Type /*dispatch_to*/, IsPrimitiveT is_primitive, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, InitialValueT initial_value, OffsetT num_items, EqualityOpT equality_op) { // Invoke kernel to invoke device-side dispatch cudaError_t retval = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, Int2Type{}, is_primitive, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, initial_value, num_items, equality_op); CubDebugExit(retval); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } #endif // TEST_CDP 
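// ---------------------------------------------------------------------
// Illustrative sketch (not invoked by the tests in this file): the
// canonical CUB two-phase temporary-storage pattern that every Dispatch()
// overload above follows. A first call with d_temp_storage == NULL only
// queries temp_storage_bytes; the second call performs the scan. The
// function name and the int/float pointer types are hypothetical and
// chosen purely for illustration.
// ---------------------------------------------------------------------
inline cudaError_t ExampleExclusiveSumByKeyUsage(const int *d_keys_in,
                                                 const float *d_values_in,
                                                 float *d_values_out,
                                                 int num_items)
{
    // Phase 1: size query -- no scan is performed, only temp_storage_bytes is set
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cudaError_t error = cub::DeviceScan::ExclusiveSumByKey(d_temp_storage,
                                                           temp_storage_bytes,
                                                           d_keys_in,
                                                           d_values_in,
                                                           d_values_out,
                                                           num_items);
    if (error != cudaSuccess) return error;

    // Allocate the requested temporary storage
    error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
    if (error != cudaSuccess) return error;

    // Phase 2: run the by-key exclusive prefix sum (default cub::Equality)
    error = cub::DeviceScan::ExclusiveSumByKey(d_temp_storage,
                                               temp_storage_bytes,
                                               d_keys_in,
                                               d_values_in,
                                               d_values_out,
                                               num_items);

    cudaFree(d_temp_storage);
    return error;
}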
//--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ template void Initialize( GenMode gen_mode, T *h_in, int num_items) { for (int i = 0; i < num_items; ++i) { InitValue(gen_mode, h_in[i], i); } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve exclusive-scan problem */ template < typename KeysInputIteratorT, typename ValuesInputIteratorT, typename OutputT, typename ScanOpT, typename InitialValueT, typename EqualityOpT> void Solve( KeysInputIteratorT h_keys_in, ValuesInputIteratorT h_values_in, OutputT *h_reference, int num_items, ScanOpT scan_op, InitialValueT initial_value, EqualityOpT equality_op) { using ValueT = cub::detail::value_t; using AccumT = cub::detail::accumulator_t; if (num_items > 0) { for (int i = 0; i < num_items;) { AccumT val = static_cast(h_values_in[i]); h_reference[i] = initial_value; AccumT inclusive = static_cast(scan_op(initial_value, val)); ++i; for (; i < num_items && equality_op(h_keys_in[i - 1], h_keys_in[i]); ++i) { val = static_cast(h_values_in[i]); h_reference[i] = static_cast(inclusive); inclusive = static_cast(scan_op(inclusive, val)); } } } } /** * Solve inclusive-scan problem */ template < typename KeysInputIteratorT, typename ValuesInputIteratorT, typename OutputT, typename ScanOpT, typename EqualityOpT> void Solve( KeysInputIteratorT h_keys_in, ValuesInputIteratorT h_values_in, OutputT *h_reference, int num_items, ScanOpT scan_op, NullType /*initial_value*/, EqualityOpT equality_op) { using ValueT = cub::detail::value_t; using AccumT = cub::detail::accumulator_t; if (num_items > 0) { for (int i = 0; i < num_items;) { AccumT inclusive = h_values_in[i]; h_reference[i] = static_cast(inclusive); ++i; for (; i < num_items && equality_op(h_keys_in[i - 1], h_keys_in[i]); ++i) { AccumT val = h_values_in[i]; inclusive = static_cast(scan_op(inclusive, val)); h_reference[i] = static_cast(inclusive); } } } } template struct AllocateOutput { static void run(OutputT *&d_out, DeviceInputIteratorT, int num_items) { CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items)); } }; template struct AllocateOutput { static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */) { d_out = d_in; } }; /** * Test DeviceScan for a given problem input */ template < Backend BACKEND, typename KeysInputIteratorT, typename ValuesInputIteratorT, typename OutputT, typename ScanOpT, typename InitialValueT, typename EqualityOpT, AliasMode Mode=AliasNone> void Test( KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, OutputT *h_reference, int num_items, ScanOpT scan_op, InitialValueT initial_value, EqualityOpT equality_op) { using KeyT = cub::detail::value_t; using InputT = cub::detail::value_t; // Allocate device output array OutputT *d_values_out = NULL; if (Mode == AliasKeys) { AllocateOutput::run( d_values_out, d_keys_in, num_items); } else { AllocateOutput::run( d_values_out, d_values_in, num_items); } // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch( Int2Type(), Int2Type::PRIMITIVE>(), 1, d_temp_storage_bytes, 
d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, initial_value, num_items, equality_op)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Clear device output array if (Mode == AliasNone) { CubDebugExit(cudaMemset(d_values_out, 0, sizeof(OutputT) * num_items)); } // Run warmup/correctness iteration CubDebugExit(Dispatch( Int2Type(), Int2Type::PRIMITIVE>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, initial_value, num_items, equality_op)); // Check for correctness (and display results, if specified) const int compare = CompareDeviceResults(h_reference, d_values_out, num_items, true, g_verbose); printf("\t%s", compare ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Display performance if (g_timing_iterations > 0) { // Performance GpuTimer gpu_timer; gpu_timer.Start(); CubDebugExit(Dispatch(Int2Type(), Int2Type::PRIMITIVE>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, initial_value, num_items, equality_op)); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); } printf("\n\n"); // Cleanup if (Mode == AliasNone) { if (d_values_out) { CubDebugExit(g_allocator.DeviceFree(d_values_out)); } } if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, compare); } template void TestInplaceValues(KeysInputIteratorT d_keys_in, OutputT *d_values_in, OutputT *h_reference, int num_items, ScanOpT scan_op, InitialValueT initial_value, EqualityOpT equality_op) { Test(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); } template void TestInplaceValues(KeysInputIteratorT, ValuesInputIteratorT, OutputT *, int, ScanOpT, InitialValueT, EqualityOpT) {} template void TestInplaceKeys(T *d_keys_in, ValuesInputIteratorT d_values_in, T *h_reference, int num_items, ScanOpT scan_op, InitialValueT initial_value, EqualityOpT equality_op) { Test(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); } template void TestInplaceKeys(KeysInputIteratorT, ValuesInputIteratorT, OutputT *, int, ScanOpT, InitialValueT, EqualityOpT) {} /** * Test DeviceScan on pointer type */ template < Backend BACKEND, typename KeyT, typename InputT, typename OutputT, typename ScanOpT, typename InitialValueT, typename EqualityOpT> void TestPointer( int num_items, GenMode gen_mode, ScanOpT scan_op, InitialValueT initial_value, EqualityOpT equality_op) { printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n", (BACKEND == CDP) ? "CDP CUB" : "CUB", (std::is_same::value) ? "Inclusive" : "Exclusive", (std::is_same::value) ? "Sum" : "Scan", num_items, typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT), (gen_mode == RANDOM) ? 
"RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS"); fflush(stdout); // Allocate host arrays KeyT* h_keys_in = new KeyT[num_items]; InputT* h_values_in = new InputT[num_items]; OutputT* h_reference = new OutputT[num_items]; // Initialize problem and solution Initialize(gen_mode, h_keys_in, num_items); Initialize(gen_mode, h_values_in, num_items); // If the output type is primitive and the operator is cub::Sum, the test // dispatcher throws away scan_op and initial_value for exclusive scan. // Without an initial_value arg, the accumulator switches to the input value // type. // Do the same thing here: if (Traits::PRIMITIVE && std::is_same::value && !std::is_same::value) { Solve(h_keys_in, h_values_in, h_reference, num_items, cub::Sum{}, InputT{}, equality_op); } else { Solve(h_keys_in, h_values_in, h_reference, num_items, scan_op, initial_value, equality_op); } // Allocate problem device arrays KeyT *d_keys_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items)); InputT *d_values_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(InputT) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); // Run Test Test(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); // Test in/out values aliasing TestInplaceValues(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); // Test keys/values aliasing (should go last, changes keys) TestInplaceKeys(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); // Cleanup if (h_keys_in) delete[] h_keys_in; if (h_values_in) delete[] h_values_in; if (h_reference) delete[] h_reference; if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in)); if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in)); } /** * Test DeviceScan on iterator type */ template < Backend BACKEND, typename KeyT, typename InputT, typename OutputT, typename ScanOpT, typename InitialValueT, typename EqualityOpT> void TestIterator( int num_items, ScanOpT scan_op, InitialValueT initial_value, EqualityOpT equality_op) { printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n", (BACKEND == CDP) ? "CDP CUB" : "CUB", (std::is_same::value) ? "Inclusive" : "Exclusive", (std::is_same::value) ? 
"Sum" : "Scan", num_items, typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT)); fflush(stdout); // Use a counting iterator followed by div as the keys using CountingIterT = CountingInputIterator; CountingIterT h_keys_in_helper(0); TransformInputIterator, CountingIterT> h_keys_in(h_keys_in_helper, DivideByFiveFunctor()); // Use a constant iterator as the input InputT val = InputT(); ConstantInputIterator h_values_in(val); // Allocate host arrays OutputT* h_reference = new OutputT[num_items]; // Initialize problem and solution Solve(h_keys_in, h_values_in, h_reference, num_items, scan_op, initial_value, equality_op); // Run Test Test(h_keys_in, h_values_in, h_reference, num_items, scan_op, initial_value, equality_op); // Cleanup if (h_reference) delete[] h_reference; } /** * Test different gen modes */ template < Backend BACKEND, typename KeyT, typename InputT, typename OutputT, typename ScanOpT, typename InitialValueT, typename EqualityOpT> void Test( int num_items, ScanOpT scan_op, InitialValueT initial_value, EqualityOpT equality_op) { TestPointer( num_items, UNIFORM, scan_op, initial_value, equality_op); TestPointer( num_items, RANDOM, scan_op, initial_value, equality_op); TestIterator( num_items, scan_op, initial_value, equality_op); } /** * Test different dispatch */ template < typename KeyT, typename InputT, typename OutputT, typename ScanOpT, typename InitialValueT, typename EqualityOpT> void Test( int num_items, ScanOpT scan_op, InitialValueT initial_value, EqualityOpT equality_op) { #if TEST_CDP == 0 Test(num_items, scan_op, initial_value, equality_op); #elif TEST_CDP == 1 Test(num_items, scan_op, initial_value, equality_op); #endif // TEST_CDP } /** * Test different operators */ template void TestOp( int num_items, OutputT identity, OutputT initial_value, EqualityOpT equality_op) { // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values) Test(num_items, cub::Sum(), identity, equality_op); Test(num_items, cub::Max(), identity, equality_op); // Exclusive (non-specialized, so we can test initial-value) Test(num_items, WrapperFunctor(cub::Sum()), initial_value, equality_op); Test(num_items, WrapperFunctor(cub::Max()), initial_value, equality_op); // Inclusive (no initial value) Test(num_items, cub::Sum(), NullType(), equality_op); Test(num_items, cub::Max(), NullType(), equality_op); } /** * Test different key type and equality operator */ template void TestKeyTAndEqualityOp( int num_items, OutputT identity, OutputT initial_value) { TestOp(num_items, identity, initial_value, Equality()); TestOp( num_items, identity, initial_value, Mod2Equality()); } /** * Test different input sizes */ template < typename InputT, typename OutputT> void TestSize( int num_items, OutputT identity, OutputT initial_value) { if (num_items < 0) { TestKeyTAndEqualityOp(0, identity, initial_value); TestKeyTAndEqualityOp(1, identity, initial_value); TestKeyTAndEqualityOp(100, identity, initial_value); TestKeyTAndEqualityOp(10000, identity, initial_value); TestKeyTAndEqualityOp(1000000, identity, initial_value); } else { TestKeyTAndEqualityOp(num_items, identity, initial_value); } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = -1; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); 
args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--i= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); // %PARAM% TEST_CDP cdp 0:1 // %PARAM% TEST_VALUE_TYPES types 0:1:2:3:4:5 #if TEST_VALUE_TYPES == 0 // Test different input+output data types TestSize(num_items, (int)0, (int)99); // Test same input+output data types TestSize(num_items, (unsigned char)0, (unsigned char)99); TestSize(num_items, (char)0, (char)99); #elif TEST_VALUE_TYPES == 1 TestSize(num_items, (unsigned short)0, (unsigned short)99); TestSize(num_items, (unsigned int)0, (unsigned int)99); TestSize(num_items, (unsigned long long)0, (unsigned long long)99); #elif TEST_VALUE_TYPES == 2 TestSize(num_items, make_uchar2(0, 0), make_uchar2(17, 21)); TestSize(num_items, make_char2(0, 0), make_char2(17, 21)); TestSize(num_items, make_ushort2(0, 0), make_ushort2(17, 21)); #elif TEST_VALUE_TYPES == 3 TestSize(num_items, make_uint2(0, 0), make_uint2(17, 21)); TestSize(num_items, make_ulonglong2(0, 0), make_ulonglong2(17, 21)); TestSize(num_items, make_uchar4(0, 0, 0, 0), make_uchar4(17, 21, 32, 85)); #elif TEST_VALUE_TYPES == 4 TestSize(num_items, make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85)); TestSize(num_items, make_ushort4(0, 0, 0, 0), make_ushort4(17, 21, 32, 85)); TestSize(num_items, make_uint4(0, 0, 0, 0), make_uint4(17, 21, 32, 85)); #elif TEST_VALUE_TYPES == 5 TestSize(num_items, make_ulonglong4(0, 0, 0, 0), make_ulonglong4(17, 21, 32, 85)); TestSize(num_items, TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(1ll << 63, 1 << 31, static_cast(1 << 15), static_cast(1 << 7))); TestSize(num_items, TestBar(0, 0), TestBar(1ll << 63, 1 << 31)); #endif return 0; } cub-2.0.1/test/test_device_segmented_sort.cu000066400000000000000000001651121434614775400212200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include #define TEST_HALF_T \ (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA #define TEST_BF_T \ (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !_NVHPC_CUDA #if TEST_HALF_T #include #endif #if TEST_BF_T #include #endif using namespace cub; template struct UnwrapHalfAndBfloat16 { using Type = T; }; #if TEST_HALF_T template <> struct UnwrapHalfAndBfloat16 { using Type = __half; }; #endif #if TEST_BF_T template <> struct UnwrapHalfAndBfloat16 { using Type = __nv_bfloat16; }; #endif constexpr static int MAX_ITERATIONS = 2; class SizeGroupDescription { public: SizeGroupDescription(const int segments, const int segment_size) : segments(segments) , segment_size(segment_size) {} int segments {}; int segment_size {}; }; template struct SegmentChecker { const KeyT *sorted_keys {}; const int *offsets {}; SegmentChecker(const KeyT *sorted_keys, const int *offsets) : sorted_keys(sorted_keys) , offsets(offsets) {} bool operator()(int segment_id) { const int segment_begin = offsets[segment_id]; const int segment_end = offsets[segment_id + 1]; int counter = 0; for (int i = segment_begin; i < segment_end; i++) { if (sorted_keys[i] != static_cast(counter++)) { return false; } } return true; } }; template struct DescendingSegmentChecker { const KeyT *sorted_keys{}; const int *offsets{}; DescendingSegmentChecker(const KeyT *sorted_keys, const int *offsets) : sorted_keys(sorted_keys) , offsets(offsets) {} bool operator()(int segment_id) { const int segment_begin = offsets[segment_id]; const int segment_end = offsets[segment_id + 1]; int counter = 0; for (int i = segment_end - 1; i >= segment_begin; i--) { if (sorted_keys[i] != static_cast(counter++)) { return false; } } return true; } }; template struct ReversedIota { KeyT *data {}; const int *offsets {}; ReversedIota(KeyT *data, const int *offsets) : data(data) , offsets(offsets) {} void operator()(int segment_id) const { const int segment_begin = offsets[segment_id]; const int segment_end = offsets[segment_id + 1]; const int segment_size = segment_end - segment_begin; int count = 0; for (int i = segment_begin; i < segment_end; i++) { data[i] = static_cast(segment_size - 1 - count++); } } }; template struct Iota { KeyT *data{}; const int *offsets{}; Iota(KeyT *data, const int *offsets) : data(data) , offsets(offsets) {} void operator()(int segment_id) const { const int segment_begin = offsets[segment_id]; const int segment_end = offsets[segment_id + 1]; int count = 0; for (int i = segment_begin; i < segment_end; i++) { data[i] = static_cast(count++); } } }; template class Input { thrust::default_random_engine random_engine; thrust::device_vector d_segment_sizes; thrust::device_vector d_offsets; thrust::host_vector h_offsets; using MaskedValueT = cub::detail::conditional_t< 
std::is_same::value, KeyT, ValueT>; bool reverse {}; int num_items {}; thrust::device_vector d_keys; thrust::device_vector d_values; thrust::host_vector h_keys; thrust::host_vector h_values; public: Input(bool reverse, const thrust::host_vector &h_segment_sizes) : d_segment_sizes(h_segment_sizes) , d_offsets(d_segment_sizes.size() + 1) , h_offsets(d_segment_sizes.size() + 1) , reverse(reverse) , num_items(static_cast( thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()))) , d_keys(num_items) , d_values(num_items) , h_keys(num_items) , h_values(num_items) { update(); } Input(thrust::host_vector &h_offsets) : d_offsets(h_offsets) , h_offsets(h_offsets) , reverse(false) , num_items(h_offsets.back()) , d_keys(num_items) , d_values(num_items) { } void shuffle() { thrust::shuffle(d_segment_sizes.begin(), d_segment_sizes.end(), random_engine); update(); } int get_num_items() const { return num_items; } int get_num_segments() const { return static_cast(d_segment_sizes.size()); } const KeyT *get_d_keys() const { return thrust::raw_pointer_cast(d_keys.data()); } thrust::device_vector &get_d_keys_vec() { return d_keys; } thrust::device_vector &get_d_values_vec() { return d_values; } KeyT *get_d_keys() { return thrust::raw_pointer_cast(d_keys.data()); } const thrust::host_vector& get_h_offsets() { return h_offsets; } MaskedValueT *get_d_values() { return thrust::raw_pointer_cast(d_values.data()); } const int *get_d_offsets() const { return thrust::raw_pointer_cast(d_offsets.data()); } template bool check_output_implementation(const T *keys_output) { const int *offsets = thrust::raw_pointer_cast(h_offsets.data()); if (reverse) { DescendingSegmentChecker checker{keys_output, offsets}; for (int i = 0; i < get_num_segments(); i++) { if (!checker(i)) { return false; } } } else { SegmentChecker checker{keys_output, offsets}; for (int i = 0; i < get_num_segments(); i++) { if (!checker(i)) { return false; } } } return true; } bool check_output(const KeyT *d_keys_output, const MaskedValueT *d_values_output = nullptr) { KeyT *keys_output = thrust::raw_pointer_cast(h_keys.data()); MaskedValueT *values_output = thrust::raw_pointer_cast(h_values.data()); cudaMemcpy(keys_output, d_keys_output, sizeof(KeyT) * num_items, cudaMemcpyDeviceToHost); const bool keys_ok = check_output_implementation(keys_output); if (std::is_same::value || d_values_output == nullptr) { return keys_ok; } cudaMemcpy(values_output, d_values_output, sizeof(ValueT) * num_items, cudaMemcpyDeviceToHost); const bool values_ok = check_output_implementation(values_output); return keys_ok && values_ok; } private: void update() { fill_offsets(); gen_keys(); } void fill_offsets() { thrust::copy(d_segment_sizes.begin(), d_segment_sizes.end(), d_offsets.begin()); thrust::exclusive_scan(d_offsets.begin(), d_offsets.end(), d_offsets.begin(), 0u); thrust::copy(d_offsets.begin(), d_offsets.end(), h_offsets.begin()); } void gen_keys() { KeyT *keys_output = thrust::raw_pointer_cast(h_keys.data()); const int *offsets = thrust::raw_pointer_cast(h_offsets.data()); if (reverse) { Iota generator{keys_output, offsets}; for (int i = 0; i < get_num_segments(); i++) { generator(i); } } else { ReversedIota generator{keys_output, offsets}; for (int i = 0; i < get_num_segments(); i++) { generator(i); } } d_keys = h_keys; d_values = d_keys; } }; template ::value> class InputDescription { thrust::host_vector segment_sizes; public: InputDescription& add(const SizeGroupDescription &group) { if (static_cast(group.segment_size) < 
static_cast((std::numeric_limits::max)())) { for (int i = 0; i < group.segments; i++) { segment_sizes.push_back(group.segment_size); } } return *this; } template Input gen(bool reverse) { return Input(reverse, segment_sizes); } }; template class InputDescription { thrust::host_vector segment_sizes; public: InputDescription& add(const SizeGroupDescription &group) { for (int i = 0; i < group.segments; i++) { segment_sizes.push_back(group.segment_size); } return *this; } template Input gen(bool reverse) { return Input(reverse, segment_sizes); } }; template void Sort(bool pairs, bool descending, bool double_buffer, bool stable_sort, void *tmp_storage, std::size_t &temp_storage_bytes, WrappedKeyT *wrapped_input_keys, WrappedKeyT *wrapped_output_keys, ValueT *input_values, ValueT *output_values, int num_items, int num_segments, const int *d_offsets, int *keys_selector = nullptr, int *values_selector = nullptr) { using KeyT = typename UnwrapHalfAndBfloat16::Type; auto input_keys = reinterpret_cast(wrapped_input_keys); auto output_keys = reinterpret_cast(wrapped_output_keys); if (stable_sort) { if (pairs) { if (descending) { if (double_buffer) { cub::DoubleBuffer keys_buffer(input_keys, output_keys); keys_buffer.selector = *keys_selector; cub::DoubleBuffer values_buffer(input_values, output_values); values_buffer.selector = *values_selector; CubDebugExit(cub::DeviceSegmentedSort::StableSortPairsDescending( tmp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, d_offsets, d_offsets + 1)); *keys_selector = keys_buffer.selector; *values_selector = values_buffer.selector; } else { CubDebugExit(cub::DeviceSegmentedSort::StableSortPairsDescending( tmp_storage, temp_storage_bytes, input_keys, output_keys, input_values, output_values, num_items, num_segments, d_offsets, d_offsets + 1)); } } else { if (double_buffer) { cub::DoubleBuffer keys_buffer(input_keys, output_keys); keys_buffer.selector = *keys_selector; cub::DoubleBuffer values_buffer(input_values, output_values); values_buffer.selector = *values_selector; CubDebugExit( cub::DeviceSegmentedSort::StableSortPairs(tmp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, d_offsets, d_offsets + 1)); *keys_selector = keys_buffer.selector; *values_selector = values_buffer.selector; } else { CubDebugExit( cub::DeviceSegmentedSort::StableSortPairs(tmp_storage, temp_storage_bytes, input_keys, output_keys, input_values, output_values, num_items, num_segments, d_offsets, d_offsets + 1)); } } } else { if (descending) { if (double_buffer) { cub::DoubleBuffer keys_buffer(input_keys, output_keys); keys_buffer.selector = *keys_selector; CubDebugExit(cub::DeviceSegmentedSort::StableSortKeysDescending( tmp_storage, temp_storage_bytes, keys_buffer, num_items, num_segments, d_offsets, d_offsets + 1)); *keys_selector = keys_buffer.selector; } else { CubDebugExit(cub::DeviceSegmentedSort::StableSortKeysDescending( tmp_storage, temp_storage_bytes, input_keys, output_keys, num_items, num_segments, d_offsets, d_offsets + 1)); } } else { if (double_buffer) { cub::DoubleBuffer keys_buffer(input_keys, output_keys); keys_buffer.selector = *keys_selector; CubDebugExit( cub::DeviceSegmentedSort::StableSortKeys(tmp_storage, temp_storage_bytes, keys_buffer, num_items, num_segments, d_offsets, d_offsets + 1)); *keys_selector = keys_buffer.selector; } else { CubDebugExit( cub::DeviceSegmentedSort::StableSortKeys(tmp_storage, temp_storage_bytes, input_keys, output_keys, num_items, num_segments, d_offsets, d_offsets + 1)); } 
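        // Added commentary (not in the original source): when the
        // cub::DoubleBuffer overloads above are used, DeviceSegmentedSort is
        // free to leave the sorted result in either of the two buffers and
        // records its choice in DoubleBuffer<T>::selector. That is why every
        // double-buffer branch of this dispatcher copies
        // keys_buffer.selector / values_buffer.selector back out through the
        // keys_selector / values_selector pointers: after the call returns,
        // only d_buffers[selector] (i.e. DoubleBuffer<T>::Current()) is
        // guaranteed to hold the sorted keys/values.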
} } } else { if (pairs) { if (descending) { if (double_buffer) { cub::DoubleBuffer keys_buffer(input_keys, output_keys); keys_buffer.selector = *keys_selector; cub::DoubleBuffer values_buffer(input_values, output_values); values_buffer.selector = *values_selector; CubDebugExit( cub::DeviceSegmentedSort::SortPairsDescending(tmp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, d_offsets, d_offsets + 1)); *keys_selector = keys_buffer.selector; *values_selector = values_buffer.selector; } else { CubDebugExit( cub::DeviceSegmentedSort::SortPairsDescending(tmp_storage, temp_storage_bytes, input_keys, output_keys, input_values, output_values, num_items, num_segments, d_offsets, d_offsets + 1)); } } else { if (double_buffer) { cub::DoubleBuffer keys_buffer(input_keys, output_keys); keys_buffer.selector = *keys_selector; cub::DoubleBuffer values_buffer(input_values, output_values); values_buffer.selector = *values_selector; CubDebugExit(cub::DeviceSegmentedSort::SortPairs(tmp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, d_offsets, d_offsets + 1)); *keys_selector = keys_buffer.selector; *values_selector = values_buffer.selector; } else { CubDebugExit(cub::DeviceSegmentedSort::SortPairs(tmp_storage, temp_storage_bytes, input_keys, output_keys, input_values, output_values, num_items, num_segments, d_offsets, d_offsets + 1)); } } } else { if (descending) { if (double_buffer) { cub::DoubleBuffer keys_buffer(input_keys, output_keys); keys_buffer.selector = *keys_selector; CubDebugExit( cub::DeviceSegmentedSort::SortKeysDescending(tmp_storage, temp_storage_bytes, keys_buffer, num_items, num_segments, d_offsets, d_offsets + 1)); *keys_selector = keys_buffer.selector; } else { CubDebugExit( cub::DeviceSegmentedSort::SortKeysDescending(tmp_storage, temp_storage_bytes, input_keys, output_keys, num_items, num_segments, d_offsets, d_offsets + 1)); } } else { if (double_buffer) { cub::DoubleBuffer keys_buffer(input_keys, output_keys); keys_buffer.selector = *keys_selector; CubDebugExit(cub::DeviceSegmentedSort::SortKeys(tmp_storage, temp_storage_bytes, keys_buffer, num_items, num_segments, d_offsets, d_offsets + 1)); *keys_selector = keys_buffer.selector; } else { CubDebugExit(cub::DeviceSegmentedSort::SortKeys(tmp_storage, temp_storage_bytes, input_keys, output_keys, num_items, num_segments, d_offsets, d_offsets + 1)); } } } } } template std::size_t Sort(bool pairs, bool descending, bool double_buffer, bool stable_sort, KeyT *input_keys, KeyT *output_keys, ValueT *input_values, ValueT *output_values, int num_items, int num_segments, const int *d_offsets, int *keys_selector = nullptr, int *values_selector = nullptr) { std::size_t temp_storage_bytes = 42ul; Sort(pairs, descending, double_buffer, stable_sort, nullptr, temp_storage_bytes, input_keys, output_keys, input_values, output_values, num_items, num_segments, d_offsets, keys_selector, values_selector); thrust::device_vector temp_storage(temp_storage_bytes); std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); Sort(pairs, descending, double_buffer, stable_sort, d_temp_storage, temp_storage_bytes, input_keys, output_keys, input_values, output_values, num_items, num_segments, d_offsets, keys_selector, values_selector); return temp_storage_bytes; } constexpr bool keys_only = false; constexpr bool pairs = true; constexpr bool ascending = false; constexpr bool descending = true; constexpr bool pointers = false; constexpr bool double_buffer = true; constexpr bool unstable 
= false; constexpr bool stable = true; void TestZeroSegments() { // Type doesn't affect the escape logic, so it should be fine // to test only one set of types here. using KeyT = std::uint8_t; using ValueT = std::uint64_t; for (bool stable_sort: { unstable, stable }) { for (bool sort_pairs: { keys_only, pairs }) { for (bool sort_descending: { ascending, descending }) { for (bool sort_buffer: { pointers, double_buffer }) { cub::DoubleBuffer keys_buffer(nullptr, nullptr); cub::DoubleBuffer values_buffer(nullptr, nullptr); values_buffer.selector = 1; Sort(sort_pairs, sort_descending, sort_buffer, stable_sort, nullptr, nullptr, nullptr, nullptr, int{}, int{}, nullptr, &keys_buffer.selector, &values_buffer.selector); AssertEquals(keys_buffer.selector, 0); AssertEquals(values_buffer.selector, 1); } } } } } void TestEmptySegments(int segments) { // Type doesn't affect the escape logic, so it should be fine // to test only one set of types here. using KeyT = std::uint8_t; using ValueT = std::uint64_t; thrust::device_vector offsets(segments + 1, int{}); const int *d_offsets = thrust::raw_pointer_cast(offsets.data()); for (bool sort_stable: { unstable, stable }) { for (bool sort_pairs: { keys_only, pairs }) { for (bool sort_descending: { ascending, descending }) { for (bool sort_buffer: { pointers, double_buffer }) { cub::DoubleBuffer keys_buffer(nullptr, nullptr); cub::DoubleBuffer values_buffer(nullptr, nullptr); values_buffer.selector = 1; Sort(sort_pairs, sort_descending, sort_buffer, sort_stable, nullptr, nullptr, nullptr, nullptr, int{}, segments, d_offsets, &keys_buffer.selector, &values_buffer.selector); AssertEquals(keys_buffer.selector, 0); AssertEquals(values_buffer.selector, 1); } } } } } template void TestSameSizeSegments(int segment_size, int segments, bool skip_values = false) { const int num_items = segment_size * segments; thrust::device_vector offsets(segments + 1); thrust::sequence(offsets.begin(), offsets.end(), int{}, segment_size); const int *d_offsets = thrust::raw_pointer_cast(offsets.data()); const KeyT target_key {42}; const ValueT target_value {42}; thrust::device_vector keys_input(num_items); thrust::device_vector keys_output(num_items); KeyT *d_keys_input = thrust::raw_pointer_cast(keys_input.data()); KeyT *d_keys_output = thrust::raw_pointer_cast(keys_output.data()); thrust::device_vector values_input(num_items); thrust::device_vector values_output(num_items); thrust::host_vector host_keys(num_items); thrust::host_vector host_values(num_items); ValueT *d_values_input = thrust::raw_pointer_cast(values_input.data()); ValueT *d_values_output = thrust::raw_pointer_cast(values_output.data()); for (bool stable_sort: { unstable, stable }) { for (bool sort_pairs: { keys_only, pairs }) { if (sort_pairs) { if (skip_values) { continue; } } for (bool sort_descending: { ascending, descending }) { for (bool sort_buffers: { pointers, double_buffer }) { cub::DoubleBuffer keys_buffer(nullptr, nullptr); cub::DoubleBuffer values_buffer(nullptr, nullptr); values_buffer.selector = 1; thrust::fill(keys_input.begin(), keys_input.end(), target_key); thrust::fill(keys_output.begin(), keys_output.end(), KeyT{}); if (sort_pairs) { if (sort_buffers) { thrust::fill(values_input.begin(), values_input.end(), ValueT{}); thrust::fill(values_output.begin(), values_output.end(), target_value); } else { thrust::fill(values_input.begin(), values_input.end(), target_value); thrust::fill(values_output.begin(), values_output.end(), ValueT{}); } } const std::size_t temp_storage_bytes = Sort(sort_pairs, 
sort_descending, sort_buffers, stable_sort, d_keys_input, d_keys_output, d_values_input, d_values_output, num_items, segments, d_offsets, &keys_buffer.selector, &values_buffer.selector); // If temporary storage size is defined by extra keys storage if (sort_buffers) { if (2 * segments * sizeof(unsigned int) < num_items * sizeof(KeyT)) { std::size_t extra_temp_storage_bytes{}; Sort(sort_pairs, sort_descending, pointers, stable_sort, nullptr, extra_temp_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, num_items, segments, d_offsets, &keys_buffer.selector, &values_buffer.selector); AssertTrue(extra_temp_storage_bytes > temp_storage_bytes); } } { host_keys = keys_buffer.selector || !sort_buffers ? keys_output : keys_input; const std::size_t items_selected = thrust::count(host_keys.begin(), host_keys.end(), target_key); AssertEquals(static_cast(items_selected), num_items); } if (sort_pairs) { host_values = values_buffer.selector || !sort_buffers ? values_output : values_input; const std::size_t items_selected = thrust::count(host_values.begin(), host_values.end(), target_value); AssertEquals(static_cast(items_selected), num_items); } } } } } } template void InputTest(bool sort_descending, Input &input) { thrust::device_vector keys_output(input.get_num_items()); KeyT *d_keys_output = thrust::raw_pointer_cast(keys_output.data()); thrust::device_vector values_output(input.get_num_items()); ValueT *d_values_output = thrust::raw_pointer_cast(values_output.data()); for (bool stable_sort: { unstable, stable }) { for (bool sort_pairs : { keys_only, pairs }) { for (bool sort_buffers : {pointers, double_buffer}) { for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { thrust::fill(keys_output.begin(), keys_output.end(), KeyT{}); thrust::fill(values_output.begin(), values_output.end(), ValueT{}); cub::DoubleBuffer keys_buffer(input.get_d_keys(), d_keys_output); cub::DoubleBuffer values_buffer(input.get_d_values(), d_values_output); Sort(sort_pairs, sort_descending, sort_buffers, stable_sort, input.get_d_keys(), d_keys_output, input.get_d_values(), d_values_output, input.get_num_items(), input.get_num_segments(), input.get_d_offsets(), &keys_buffer.selector, &values_buffer.selector); if (sort_buffers) { if (sort_pairs) { AssertTrue(input.check_output(keys_buffer.Current(), values_buffer.Current())); } else { AssertTrue(input.check_output(keys_buffer.Current())); } } else { if (sort_pairs) { AssertTrue(input.check_output(d_keys_output, d_values_output)); } else { AssertTrue(input.check_output(d_keys_output)); } } input.shuffle(); } } } } } struct ComparisonPredicate { template __host__ __device__ bool operator()(const T &lhs, const T &rhs) const { return lhs == rhs; } __host__ __device__ bool operator()(const half_t &lhs, const half_t &rhs) const { return lhs.raw() == rhs.raw(); } }; template bool compare_two_outputs(const thrust::host_vector &offsets, const thrust::host_vector &lhs, const thrust::host_vector &rhs) { const auto num_segments = static_cast(offsets.size() - 1); for (std::size_t segment_id = 0; segment_id < num_segments; segment_id++) { auto lhs_begin = lhs.cbegin() + offsets[segment_id]; auto lhs_end = lhs.cbegin() + offsets[segment_id + 1]; auto rhs_begin = rhs.cbegin() + offsets[segment_id]; auto err = thrust::mismatch(lhs_begin, lhs_end, rhs_begin, ComparisonPredicate{}); if (err.first != lhs_end) { const auto idx = thrust::distance(lhs_begin, err.first); const auto segment_size = std::distance(lhs_begin, lhs_end); std::cerr << "Mismatch in segment " 
<< segment_id << " at position " << idx << " / " << segment_size << ": " << static_cast(lhs_begin[idx]) << " vs " << static_cast(rhs_begin[idx]) << " (" << typeid(lhs_begin[idx]).name() << ")" << std::endl; return false; } } return true; } template void RandomizeInput(thrust::host_vector &h_keys, thrust::host_vector &h_values) { for (std::size_t i = 0; i < h_keys.size(); i++) { h_keys[i] = RandomValue((std::numeric_limits::max)()); h_values[i] = RandomValue((std::numeric_limits::max)()); } } #if TEST_HALF_T void RandomizeInput(thrust::host_vector &h_keys, thrust::host_vector &h_values) { for (std::size_t i = 0; i < h_keys.size(); i++) { h_keys[i] = RandomValue((std::numeric_limits::max)()); h_values[i] = RandomValue((std::numeric_limits::max)()); } } #endif #if TEST_BF_T void RandomizeInput(thrust::host_vector &h_keys, thrust::host_vector &h_values) { for (std::size_t i = 0; i < h_keys.size(); i++) { h_keys[i] = RandomValue((std::numeric_limits::max)()); h_values[i] = RandomValue((std::numeric_limits::max)()); } } #endif template void HostReferenceSort(bool sort_pairs, bool sort_descending, unsigned int num_segments, const thrust::host_vector &h_offsets, thrust::host_vector &h_keys, thrust::host_vector &h_values) { for (unsigned int segment_i = 0; segment_i < num_segments; segment_i++) { const int segment_begin = h_offsets[segment_i]; const int segment_end = h_offsets[segment_i + 1]; if (sort_pairs) { if (sort_descending) { thrust::stable_sort_by_key(h_keys.begin() + segment_begin, h_keys.begin() + segment_end, h_values.begin() + segment_begin, thrust::greater{}); } else { thrust::stable_sort_by_key(h_keys.begin() + segment_begin, h_keys.begin() + segment_end, h_values.begin() + segment_begin); } } else { if (sort_descending) { thrust::stable_sort(h_keys.begin() + segment_begin, h_keys.begin() + segment_end, thrust::greater{}); } else { thrust::stable_sort(h_keys.begin() + segment_begin, h_keys.begin() + segment_end); } } } } #if STORE_ON_FAILURE template void DumpInput(bool sort_pairs, bool sort_descending, bool sort_buffers, Input &input, thrust::host_vector &h_keys, thrust::host_vector &h_values) { const thrust::host_vector &h_offsets = input.get_h_offsets(); std::cout << "sort pairs: " << sort_pairs << "\n"; std::cout << "sort descending: " << sort_descending << "\n"; std::cout << "sort buffers: " << sort_buffers << "\n"; std::cout << "num_items: " << input.get_num_items() << "\n"; std::cout << "num_segments: " << input.get_num_segments() << "\n"; std::cout << "key type: " << typeid(h_keys[0]).name() << "\n"; std::cout << "value type: " << typeid(h_values[0]).name() << "\n"; std::cout << "offset type: " << typeid(h_offsets[0]).name() << "\n"; std::ofstream offsets_dump("offsets", std::ios::binary); offsets_dump.write(reinterpret_cast( thrust::raw_pointer_cast(h_offsets.data())), sizeof(int) * h_offsets.size()); std::ofstream keys_dump("keys", std::ios::binary); keys_dump.write(reinterpret_cast( thrust::raw_pointer_cast(h_keys.data())), sizeof(KeyT) * h_keys.size()); std::ofstream values_dump("values", std::ios::binary); values_dump.write(reinterpret_cast( thrust::raw_pointer_cast(h_values.data())), sizeof(ValueT) * h_values.size()); } #endif template void InputTestRandom(Input &input) { thrust::host_vector h_keys_output(input.get_num_items()); thrust::device_vector keys_output(input.get_num_items()); thrust::host_vector h_values_output(input.get_num_items()); thrust::device_vector values_output(input.get_num_items()); KeyT *d_keys_output = thrust::raw_pointer_cast(keys_output.data()); 
ValueT *d_values_output = thrust::raw_pointer_cast(values_output.data()); thrust::host_vector h_keys(input.get_num_items()); thrust::host_vector h_values(input.get_num_items()); const thrust::host_vector &h_offsets = input.get_h_offsets(); for (bool stable_sort: { unstable, stable }) { for (bool sort_pairs: { keys_only, pairs }) { for (bool sort_descending: { ascending, descending }) { for (bool sort_buffers: { pointers, double_buffer }) { for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { RandomizeInput(h_keys, h_values); #if STORE_ON_FAILURE auto h_keys_backup = h_keys; auto h_values_backup = h_values; #endif input.get_d_keys_vec() = h_keys; input.get_d_values_vec() = h_values; cub::DoubleBuffer keys_buffer(input.get_d_keys(), d_keys_output); cub::DoubleBuffer values_buffer(input.get_d_values(), d_values_output); Sort(sort_pairs, sort_descending, sort_buffers, stable_sort, input.get_d_keys(), d_keys_output, input.get_d_values(), d_values_output, input.get_num_items(), input.get_num_segments(), input.get_d_offsets(), &keys_buffer.selector, &values_buffer.selector); HostReferenceSort(sort_pairs, sort_descending, input.get_num_segments(), h_offsets, h_keys, h_values); if (sort_buffers) { if (keys_buffer.selector) { h_keys_output = keys_output; } else { h_keys_output = input.get_d_keys_vec(); } if (values_buffer.selector) { h_values_output = values_output; } else { h_values_output = input.get_d_values_vec(); } } else { h_keys_output = keys_output; h_values_output = values_output; } const bool keys_ok = compare_two_outputs(h_offsets, h_keys, h_keys_output); const bool values_ok = sort_pairs ? compare_two_outputs(h_offsets, h_values, h_values_output) : true; #if STORE_ON_FAILURE if (!keys_ok || !values_ok) { DumpInput(sort_pairs, sort_descending, sort_buffers, input, h_keys_backup, h_values_backup); } #endif AssertTrue(keys_ok); AssertTrue(values_ok); input.shuffle(); } } } } } } template ::value> struct EdgeTestDispatch { // Edge cases that needs to be tested const int empty_short_circuit_segment_size = 0; const int copy_short_circuit_segment_size = 1; const int swap_short_circuit_segment_size = 2; const int a_few = 2; const int a_bunch_of = 42; const int a_lot_of = 420; template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { NV_IF_TARGET(NV_IS_HOST, (using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; const int small_segment_max_segment_size = SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_TILE; const int items_per_small_segment = SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_THREAD; const int medium_segment_max_segment_size = SmallAndMediumPolicyT::MediumPolicyT::ITEMS_PER_TILE; const int single_thread_segment_size = items_per_small_segment; const int large_cached_segment_max_segment_size = LargeSegmentPolicyT::BLOCK_THREADS * LargeSegmentPolicyT::ITEMS_PER_THREAD; for (bool sort_descending : {ascending, descending}) { Input edge_cases = InputDescription() .add({a_lot_of, empty_short_circuit_segment_size}) .add({a_lot_of, copy_short_circuit_segment_size}) .add({a_lot_of, swap_short_circuit_segment_size}) .add({a_lot_of, swap_short_circuit_segment_size + 1}) .add({a_lot_of, swap_short_circuit_segment_size + 1}) .add({a_lot_of, single_thread_segment_size - 1}) .add({a_lot_of, single_thread_segment_size}) .add({a_lot_of, single_thread_segment_size + 1}) .add({a_lot_of, single_thread_segment_size * 2 - 1}) .add({a_lot_of, single_thread_segment_size * 2}) 
.add({a_lot_of, single_thread_segment_size * 2 + 1}) .add({a_bunch_of, small_segment_max_segment_size - 1}) .add({a_bunch_of, small_segment_max_segment_size}) .add({a_bunch_of, small_segment_max_segment_size + 1}) .add({a_bunch_of, medium_segment_max_segment_size - 1}) .add({a_bunch_of, medium_segment_max_segment_size}) .add({a_bunch_of, medium_segment_max_segment_size + 1}) .add({a_bunch_of, large_cached_segment_max_segment_size - 1}) .add({a_bunch_of, large_cached_segment_max_segment_size}) .add({a_bunch_of, large_cached_segment_max_segment_size + 1}) .add({a_few, large_cached_segment_max_segment_size * 2}) .add({a_few, large_cached_segment_max_segment_size * 3}) .add({a_few, large_cached_segment_max_segment_size * 5}) .template gen(sort_descending); InputTest(sort_descending, edge_cases); })); return cudaSuccess; } }; template struct EdgeTestDispatch { template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { // Edge case test is using an optimized testing approach which is // incompatible with duplicates. RandomTest is used for other types. return cudaSuccess; } }; template void EdgePatternsTest() { int ptx_version = 0; if (CubDebug(PtxVersion(ptx_version))) { return; } using MaxPolicyT = typename cub::DeviceSegmentedSortPolicy::MaxPolicy; using EdgeTestDispatchT = EdgeTestDispatch; EdgeTestDispatchT dispatch; MaxPolicyT::Invoke(ptx_version, dispatch); } template Input GenRandomInput(int max_items, int min_segments, int max_segments, bool descending) { int items_generated {}; const int segments_num = RandomValue(max_segments) + min_segments; thrust::host_vector segment_sizes; segment_sizes.reserve(segments_num); const int max_segment_size = 6000; for (int segment_id = 0; segment_id < segments_num; segment_id++) { const int segment_size_raw = RandomValue(max_segment_size); const int segment_size = segment_size_raw > 0 ? 
segment_size_raw : 0; if (segment_size + items_generated > max_items) { break; } items_generated += segment_size; segment_sizes.push_back(segment_size); } return Input{descending, segment_sizes}; } template void RandomTest(int min_segments, int max_segments) { const int max_items = 10000000; for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { Input edge_cases = GenRandomInput(max_items, min_segments, max_segments, descending); InputTestRandom(edge_cases); } } template void Test() { for (int segment_size: { 1, 1024, 24 * 1024 }) { for (int segments: { 1, 1024 }) { TestSameSizeSegments(segment_size, segments); } } RandomTest(1 << 2, 1 << 8); RandomTest(1 << 9, 1 << 19); EdgePatternsTest(); } #if TEST_CDP == 1 template __global__ void LauncherKernel( void *tmp_storage, std::size_t temp_storage_bytes, const KeyT *in_keys, KeyT *out_keys, int num_items, int num_segments, const int *offsets) { CubDebug(cub::DeviceSegmentedSort::SortKeys(tmp_storage, temp_storage_bytes, in_keys, out_keys, num_items, num_segments, offsets, offsets + 1)); } template void TestDeviceSideLaunch(Input &input) { thrust::host_vector h_keys_output(input.get_num_items()); thrust::device_vector keys_output(input.get_num_items()); thrust::host_vector h_values_output(input.get_num_items()); thrust::device_vector values_output(input.get_num_items()); KeyT *d_keys_output = thrust::raw_pointer_cast(keys_output.data()); thrust::host_vector h_keys(input.get_num_items()); thrust::host_vector h_values(input.get_num_items()); const thrust::host_vector &h_offsets = input.get_h_offsets(); for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { RandomizeInput(h_keys, h_values); input.get_d_keys_vec() = h_keys; input.get_d_values_vec() = h_values; const KeyT *d_input = input.get_d_keys(); std::size_t temp_storage_bytes{}; cub::DeviceSegmentedSort::SortKeys(nullptr, temp_storage_bytes, d_input, d_keys_output, input.get_num_items(), input.get_num_segments(), input.get_d_offsets(), input.get_d_offsets() + 1); thrust::device_vector temp_storage(temp_storage_bytes); std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); LauncherKernel<<<1, 1>>>( d_temp_storage, temp_storage_bytes, d_input, d_keys_output, input.get_num_items(), input.get_num_segments(), input.get_d_offsets()); CubDebugExit(cudaDeviceSynchronize()); CubDebugExit(cudaPeekAtLastError()); HostReferenceSort(false, false, input.get_num_segments(), h_offsets, h_keys, h_values); h_keys_output = keys_output; const bool keys_ok = compare_two_outputs(h_offsets, h_keys, h_keys_output); AssertTrue(keys_ok); input.shuffle(); } } template void TestDeviceSideLaunch(int min_segments, int max_segments) { const int max_items = 10000000; for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { Input edge_cases = GenRandomInput(max_items, min_segments, max_segments, descending); TestDeviceSideLaunch(edge_cases); } } template void TestDeviceSideLaunch() { TestDeviceSideLaunch(1 << 2, 1 << 8); TestDeviceSideLaunch(1 << 9, 1 << 19); } #endif // TEST_CDP void TestUnspecifiedRanges() { const std::size_t num_items = 1024 * 1024; const std::size_t max_segments = 42; const std::size_t avg_segment_size = num_items / max_segments; for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { thrust::host_vector h_offsets_begin; thrust::host_vector h_offsets_end; h_offsets_begin.reserve(max_segments + 1); h_offsets_end.reserve(max_segments + 1); { int offset = 0; for (std::size_t sid = 0; sid < max_segments; sid++) { const int segment_size = 
RandomValue(static_cast(avg_segment_size)); const bool segment_is_utilized = RandomValue(100) > 60; if (segment_is_utilized) { h_offsets_begin.push_back(offset); h_offsets_end.push_back(offset + segment_size); } offset += segment_size; } if (h_offsets_begin.empty()) { h_offsets_begin.push_back(avg_segment_size); h_offsets_end.push_back(num_items); } } thrust::device_vector keys(num_items); thrust::device_vector values(num_items); thrust::sequence(keys.rbegin(), keys.rend()); thrust::sequence(values.rbegin(), values.rend()); thrust::device_vector d_offsets_begin = h_offsets_begin; thrust::device_vector d_offsets_end = h_offsets_end; thrust::device_vector expected_keys = keys; thrust::device_vector expected_values = values; const int num_segments = static_cast(h_offsets_begin.size()); for (int sid = 0; sid < num_segments; sid++) { const int segment_begin = h_offsets_begin[sid]; const int segment_end = h_offsets_end[sid]; thrust::sort_by_key(expected_keys.begin() + segment_begin, expected_keys.begin() + segment_end, expected_values.begin() + segment_begin); } thrust::device_vector result_keys = keys; thrust::device_vector result_values = values; { cub::DoubleBuffer keys_buffer( thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(result_keys.data())); cub::DoubleBuffer values_buffer( thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(result_values.data())); std::size_t temp_storage_bytes{}; std::uint8_t *d_temp_storage{}; CubDebugExit(cub::DeviceSegmentedSort::SortPairs( d_temp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()))); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DeviceSegmentedSort::SortPairs( d_temp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()))); for (int sid = 0; sid < num_segments; sid++) { const int segment_begin = h_offsets_begin[sid]; const int segment_end = h_offsets_end[sid]; if (keys_buffer.selector == 0) { thrust::copy( keys.begin() + segment_begin, keys.begin() + segment_end, result_keys.begin() + segment_begin); } if (values_buffer.selector == 0) { thrust::copy( values.begin() + segment_begin, values.begin() + segment_end, result_values.begin() + segment_begin); } } } AssertEquals(result_values, expected_values); AssertEquals(result_keys, expected_keys); thrust::sequence(keys.rbegin(), keys.rend()); thrust::sequence(values.rbegin(), values.rend()); result_keys = keys; result_values = values; { std::size_t temp_storage_bytes{}; std::uint8_t *d_temp_storage{}; CubDebugExit(cub::DeviceSegmentedSort::SortPairs( d_temp_storage, temp_storage_bytes, thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(result_keys.data()), thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(result_values.data()), num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()))); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DeviceSegmentedSort::SortPairs( d_temp_storage, temp_storage_bytes, thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(result_keys.data()), thrust::raw_pointer_cast(values.data()), 
thrust::raw_pointer_cast(result_values.data()), num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()))); } AssertEquals(result_values, expected_values); AssertEquals(result_keys, expected_keys); } } int main(int argc, char** argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); // %PARAM% TEST_CDP cdp 0:1 #if TEST_CDP == 0 TestZeroSegments(); TestEmptySegments(1 << 2); TestEmptySegments(1 << 22); #if TEST_HALF_T Test(); #endif #if TEST_BF_T Test(); #endif Test(); Test(); #elif TEST_CDP == 1 TestDeviceSideLaunch(); #endif // TEST_CDP TestUnspecifiedRanges(); return 0; } cub-2.0.1/test/test_device_select_if.cu000066400000000000000000001112471434614775400201330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of DeviceSelect::If and DevicePartition::If utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_timing_iterations = 0; float g_device_giga_bandwidth; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; // Selection functor type template struct LessThan { T compare; __host__ __device__ __forceinline__ LessThan(T compare) : compare(compare) {} __host__ __device__ __forceinline__ bool operator()(const T &a) const { return (a < compare); } }; //--------------------------------------------------------------------- // Dispatch to different CUB DeviceSelect entrypoints //--------------------------------------------------------------------- /** * Dispatch to select if entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, Int2Type /*is_flagged*/, Int2Type /*is_partition*/, int timing_timing_iterations, size_t* /*d_temp_storage_bytes*/, cudaError_t* /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagIteratorT /*d_flags*/, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); } return error; } /** * Dispatch to partition if entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, Int2Type /*is_flagged*/, Int2Type /*is_partition*/, int timing_timing_iterations, size_t* /*d_temp_storage_bytes*/, cudaError_t* /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagIteratorT /*d_flags*/, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); } return error; } /** * Dispatch to select flagged entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, Int2Type /*is_flagged*/, Int2Type /*partition*/, int timing_timing_iterations, size_t* /*d_temp_storage_bytes*/, cudaError_t* /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT /*select_op*/) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); } return error; 
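    // Added commentary (not in the original source): the Test() driver below
    // invokes these Dispatch helpers multiple times. The first invocation
    // passes d_temp_storage == NULL, so DeviceSelect / DevicePartition only
    // report the workspace size they need in temp_storage_bytes; subsequent
    // invocations pass the allocated workspace and perform the
    // warmup/correctness run and the timed iterations. Note the difference in
    // output layout that the reference Solve() below accounts for: the
    // DeviceSelect entry points compact only the selected items to the front
    // of d_out, whereas the DevicePartition entry points additionally write
    // the rejected items to the back of d_out in reverse order.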
} /** * Dispatch to partition flagged entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, Int2Type /*is_flagged*/, Int2Type /*partition*/, int timing_timing_iterations, size_t* /*d_temp_storage_bytes*/, cudaError_t* /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT /*select_op*/) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); } return error; } //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceSelect */ template __global__ void CDPDispatchKernel(Int2Type cub_backend, IsFlaggedTag is_flagged, IsPartitionTag is_partition, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op) { *d_cdp_error = Dispatch(cub_backend, is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op); *d_temp_storage_bytes = temp_storage_bytes; } /** * Dispatch to CDP kernel */ template cudaError_t Dispatch(Int2Type /*dispatch_to*/, IsFlaggedTag is_flagged, IsPartitionTag is_partition, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op) { // Invoke kernel to invoke device-side dispatch cudaError_t retval = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, Int2Type{}, is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op); CubDebugExit(retval); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } #endif // TEST_CDP //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ template void Initialize( T* h_in, int num_items) { for (int i = 0; i < num_items; ++i) { // Initialize each item to a randomly selected value from [0..126] unsigned int value; RandomBits(value, 0, 0, 7); if (value == 127) value = 126; InitValue(INTEGER_SEED, h_in[i], value); } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve selection problem (and set corresponding flags) */ template < typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename T> int Solve( InputIteratorT h_in, SelectOpT select_op, T* h_reference, FlagIteratorT h_flags, int 
num_items) { int num_selected = 0; for (int i = 0; i < num_items; ++i) { if ((h_flags[i] = select_op(h_in[i]))) { h_reference[num_selected] = h_in[i]; num_selected++; } else { h_reference[num_items - (i - num_selected) - 1] = h_in[i]; } } return num_selected; } /** * Test DeviceSelect for a given problem input */ template < Backend BACKEND, bool IS_FLAGGED, bool IS_PARTITION, typename DeviceInputIteratorT, typename FlagT, typename SelectOpT, typename T> void Test( DeviceInputIteratorT d_in, FlagT* h_flags, SelectOpT select_op, T* h_reference, int num_selected, int num_items) { // Allocate device flags, output, and num-selected FlagT* d_flags = NULL; T* d_out = NULL; int* d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(FlagT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate CDP device arrays size_t* d_temp_storage_bytes = NULL; cudaError_t* d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch(Int2Type(), Int2Type(), Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Copy flags and clear device output array CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(FlagT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items)); CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int))); // Run warmup/correctness iteration CubDebugExit(Dispatch(Int2Type(), Int2Type(), Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op)); // Check for correctness (and display results, if specified) int compare1 = (IS_PARTITION) ? CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose) : CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); printf("\t Data %s\n", compare1 ? "FAIL" : "PASS"); int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s\n", compare2 ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Performance GpuTimer gpu_timer; gpu_timer.Start(); CubDebugExit(Dispatch(Int2Type(), Int2Type(), Int2Type(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op)); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; int num_output_items = (IS_PARTITION) ? num_items : num_selected; int num_flag_items = (IS_FLAGGED) ? 
num_items : 0; size_t num_bytes = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items; float giga_bandwidth = float(num_bytes) / avg_millis / 1000.0f / 1000.0f; printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); } printf("\n\n"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Cleanup if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, compare1 | compare2); } /** * Test on pointer type */ template < Backend BACKEND, bool IS_FLAGGED, bool IS_PARTITION, typename T> void TestPointer( int num_items, float select_ratio) { typedef char FlagT; // Allocate host arrays T* h_in = new T[num_items]; FlagT* h_flags = new FlagT[num_items]; T* h_reference = new T[num_items]; // Initialize input Initialize(h_in, num_items); // Select a comparison value that is select_ratio through the space of [0,127] T compare; if (select_ratio <= 0.0) InitValue(INTEGER_SEED, compare, 0); // select none else if (select_ratio >= 1.0) InitValue(INTEGER_SEED, compare, 127); // select all else InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio))); LessThan select_op(compare); int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items); if (g_verbose) std::cout << "\nComparison item: " << compare << "\n"; printf("\nPointer %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n", (IS_PARTITION) ? "DevicePartition" : "DeviceSelect", (IS_FLAGGED) ? "Flagged" : "If", (BACKEND == CDP) ? 
"CDP CUB" : "CUB", num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T)); fflush(stdout); // Allocate problem device arrays T *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice)); // Run Test Test(d_in, h_flags, select_op, h_reference, num_selected, num_items); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (h_flags) delete[] h_flags; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); } /** * Test on iterator type */ template < Backend BACKEND, bool IS_FLAGGED, bool IS_PARTITION, typename T> void TestIterator( int num_items, float select_ratio) { typedef char FlagT; // Allocate host arrays T* h_reference = new T[num_items]; FlagT* h_flags = new FlagT[num_items]; // Use counting iterator as the input CountingInputIterator h_in(0); // Select a comparison value that is select_ratio through the space of [0,127] T compare; if (select_ratio <= 0.0) InitValue(INTEGER_SEED, compare, 0); // select none else if (select_ratio >= 1.0) InitValue(INTEGER_SEED, compare, 127); // select all else InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio))); LessThan select_op(compare); int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items); if (g_verbose) std::cout << "\nComparison item: " << compare << "\n"; printf("\nIterator %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n", (IS_PARTITION) ? "DevicePartition" : "DeviceSelect", (IS_FLAGGED) ? "Flagged" : "If", (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T)); fflush(stdout); // Run Test Test(h_in, h_flags, select_op, h_reference, num_selected, num_items); // Cleanup if (h_reference) delete[] h_reference; if (h_flags) delete[] h_flags; } /** * Test different selection ratios */ template < Backend BACKEND, bool IS_FLAGGED, bool IS_PARTITION, typename T> void Test( int num_items) { for (float select_ratio = 0.0f; select_ratio <= 1.0f; select_ratio += 0.2f) { TestPointer(num_items, select_ratio); } } /** * Test (select vs. partition) and (flagged vs. 
functor) */ template < Backend BACKEND, typename T> void TestMethod( int num_items) { // Functor Test(num_items); Test(num_items); // Flagged Test(num_items); Test(num_items); } /** * Test different dispatch */ template < typename T> void TestOp( int num_items) { #if TEST_CDP == 0 TestMethod(num_items); #elif TEST_CDP == 1 TestMethod(num_items); #endif // TEST_CDP } /** * Test different input sizes */ template void Test( int num_items) { if (num_items < 0) { TestOp(0); TestOp(1); TestOp(100); TestOp(10000); TestOp(1000000); } else { TestOp(num_items); } } template struct pair_to_col_t { __host__ __device__ T0 operator()(const thrust::tuple &in) { return thrust::get<0>(in); } }; template struct select_t { __host__ __device__ bool operator()(const thrust::tuple &in) { return static_cast(thrust::get<0>(in)) > thrust::get<1>(in); } }; template void TestMixedOp(int num_items) { const T0 target_value = static_cast(42); thrust::device_vector col_a(num_items, target_value); thrust::device_vector col_b(num_items, static_cast(4.2)); thrust::device_vector result(num_items); auto in = thrust::make_zip_iterator(col_a.begin(), col_b.begin()); auto out = thrust::make_transform_output_iterator(result.begin(), pair_to_col_t{}); void *d_tmp_storage {}; std::size_t tmp_storage_size{}; cub::DeviceSelect::If( d_tmp_storage, tmp_storage_size, in, out, thrust::make_discard_iterator(), num_items, select_t{}); thrust::device_vector tmp_storage(tmp_storage_size); d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data()); cub::DeviceSelect::If( d_tmp_storage, tmp_storage_size, in, out, thrust::make_discard_iterator(), num_items, select_t{}); AssertEquals(num_items, thrust::count(result.begin(), result.end(), target_value)); } /** * Test different input sizes */ template void TestMixed(int num_items) { if (num_items < 0) { TestMixedOp(0); TestMixedOp(1); TestMixedOp(100); TestMixedOp(10000); TestMixedOp(1000000); } else { TestMixedOp(num_items); } } void TestFlagsNormalization() { const int num_items = 1024 * 1024; thrust::device_vector result(num_items); void *d_tmp_storage{}; std::size_t tmp_storage_size{}; CubDebugExit( cub::DeviceSelect::Flagged(d_tmp_storage, tmp_storage_size, cub::CountingInputIterator(0), // in cub::CountingInputIterator(1), // flags thrust::raw_pointer_cast(result.data()), // out thrust::make_discard_iterator(), // num_out num_items)); thrust::device_vector tmp_storage(tmp_storage_size); d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data()); CubDebugExit( cub::DeviceSelect::Flagged(d_tmp_storage, tmp_storage_size, cub::CountingInputIterator(0), // in cub::CountingInputIterator(1), // flags thrust::raw_pointer_cast(result.data()), // out thrust::make_discard_iterator(), // num_out num_items)); AssertTrue(thrust::equal(result.begin(), result.end(), thrust::make_counting_iterator(0))); } void TestFlagsAliasingInPartition() { int h_items[]{0, 1, 0, 2, 0, 3, 0, 4, 0, 5}; constexpr int num_items = sizeof(h_items) / sizeof(h_items[0]); int *d_in{}; int *d_out{}; CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in, sizeof(h_items))); CubDebugExit(g_allocator.DeviceAllocate((void **)&d_out, sizeof(h_items))); CubDebugExit( cudaMemcpy(d_in, h_items, sizeof(h_items), cudaMemcpyHostToDevice)); // alias flags and keys int *d_flags = d_in; void *d_tmp_storage{}; std::size_t tmp_storage_size{}; CubDebugExit( cub::DevicePartition::Flagged(d_tmp_storage, tmp_storage_size, d_in, d_flags, d_out, thrust::make_discard_iterator(), // num_out num_items)); thrust::device_vector 
tmp_storage(tmp_storage_size); d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data()); CubDebugExit( cub::DevicePartition::Flagged(d_tmp_storage, tmp_storage_size, d_in, d_flags, d_out, thrust::make_discard_iterator(), // num_out num_items)); AssertTrue(thrust::equal(thrust::device, d_out, d_out + num_items / 2, thrust::make_counting_iterator(1))); AssertEquals( thrust::count(thrust::device, d_out + num_items / 2, d_out + num_items, 0), num_items / 2); CubDebugExit(g_allocator.DeviceFree(d_out)); CubDebugExit(g_allocator.DeviceFree(d_in)); } struct Odd { __host__ __device__ bool operator()(int v) const { return v % 2; } }; void TestIfInPlace() { const int num_items = 4 * 1024 * 1024; const int num_iters = 42; thrust::device_vector num_out(1); thrust::device_vector data(num_items); thrust::device_vector reference(num_items); thrust::device_vector reference_out(1); thrust::sequence(data.begin(), data.end()); Odd op{}; int *d_num_out = thrust::raw_pointer_cast(num_out.data()); int *d_data = thrust::raw_pointer_cast(data.data()); int *d_reference = thrust::raw_pointer_cast(reference.data()); int *d_reference_out = thrust::raw_pointer_cast(reference_out.data()); void *d_tmp_storage{}; std::size_t tmp_storage_size{}; CubDebugExit( cub::DeviceSelect::If(d_tmp_storage, tmp_storage_size, d_data, d_num_out, num_items, op)); thrust::device_vector tmp_storage(tmp_storage_size); d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data()); thrust::default_random_engine g{}; for (int iter = 0; iter < num_iters; iter++) { thrust::shuffle(data.begin(), data.end(), g); CubDebugExit( cub::DeviceSelect::If(d_tmp_storage, tmp_storage_size, d_data, d_reference, d_reference_out, num_items, op)); CubDebugExit( cub::DeviceSelect::If(d_tmp_storage, tmp_storage_size, d_data, d_num_out, num_items, op)); AssertEquals(num_out, reference_out); const int num_selected = num_out[0]; const bool match_reference = thrust::equal(reference.begin(), reference.begin() + num_selected, data.begin()); AssertTrue(match_reference); } } void TestFlaggedInPlace() { const int num_items = 4 * 1024 * 1024; const int num_iters = 42; thrust::device_vector num_out(1); thrust::device_vector data(num_items); thrust::device_vector flags(num_items); int h_num_out{}; int *d_num_out = thrust::raw_pointer_cast(num_out.data()); int *d_data = thrust::raw_pointer_cast(data.data()); bool *d_flags = thrust::raw_pointer_cast(flags.data()); void *d_tmp_storage{}; std::size_t tmp_storage_size{}; CubDebugExit( cub::DeviceSelect::Flagged(d_tmp_storage, tmp_storage_size, d_data, d_flags, d_num_out, num_items)); thrust::device_vector tmp_storage(tmp_storage_size); d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data()); thrust::default_random_engine g{}; for (int iter = 0; iter < num_iters; iter++) { const int num_selected = RandomValue(num_items); thrust::sequence(data.begin(), data.end()); thrust::fill(flags.begin(), flags.begin() + num_selected, true); thrust::fill(flags.begin() + num_selected, flags.end(), false); thrust::shuffle(flags.begin(), flags.end(), g); CubDebugExit( cub::DeviceSelect::Flagged(d_tmp_storage, tmp_storage_size, d_data, d_flags, d_num_out, num_items)); cudaMemcpy(&h_num_out, d_num_out, sizeof(int), cudaMemcpyDeviceToHost); AssertEquals(num_selected, h_num_out); auto selection_perm_begin = thrust::make_permutation_iterator(flags.begin(), data.begin()); auto selection_perm_end = selection_perm_begin + num_selected; AssertEquals(num_selected, thrust::count(selection_perm_begin, selection_perm_end, true)); } } void 
TestFlaggedInPlaceWithAliasedFlags() { const int num_items = 1024 * 1024; const int num_iters = 42; thrust::device_vector num_out(1); thrust::device_vector data(num_items); thrust::device_vector reference(num_items); thrust::device_vector flags(num_items); int h_num_out{}; int *d_num_out = thrust::raw_pointer_cast(num_out.data()); int *d_data = thrust::raw_pointer_cast(data.data()); int *d_flags = d_data; // alias int *d_allocated_flags = thrust::raw_pointer_cast(data.data()); int *d_reference = thrust::raw_pointer_cast(reference.data()); void *d_tmp_storage{}; std::size_t tmp_storage_size{}; CubDebugExit( cub::DeviceSelect::Flagged(d_tmp_storage, tmp_storage_size, d_data, d_flags, d_num_out, num_items)); thrust::device_vector tmp_storage(tmp_storage_size); d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data()); thrust::default_random_engine g{}; for (int iter = 0; iter < num_iters; iter++) { const int num_selected = RandomValue(num_items); thrust::sequence(data.begin(), data.begin() + num_selected, 1); thrust::fill(data.begin() + num_selected, data.end(), 0); thrust::shuffle(data.begin(), data.end(), g); CubDebugExit( cub::DeviceSelect::Flagged(d_tmp_storage, tmp_storage_size, d_data, // in d_allocated_flags, d_reference, // out d_num_out, num_items)); CubDebugExit( cub::DeviceSelect::Flagged(d_tmp_storage, tmp_storage_size, d_data, d_flags, d_num_out, num_items)); cudaMemcpy(&h_num_out, d_num_out, sizeof(int), cudaMemcpyDeviceToHost); AssertEquals(num_selected, h_num_out); const bool match_reference = thrust::equal(reference.begin(), reference.begin() + num_selected, data.begin()); AssertTrue(match_reference); } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = -1; float select_ratio = 0.5; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("ratio", select_ratio); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--i= " "[--device=] " "[--ratio=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); // %PARAM% TEST_CDP cdp 0:1 TestFlagsAliasingInPartition(); TestFlaggedInPlace(); TestFlaggedInPlaceWithAliasedFlags(); TestIfInPlace(); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); TestMixed(num_items); TestFlagsNormalization(); return 0; } cub-2.0.1/test/test_device_select_unique.cu000066400000000000000000000455051434614775400210460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of DeviceSelect::Unique utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include "test_util.h" #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_timing_iterations = 0; float g_device_giga_bandwidth; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; //--------------------------------------------------------------------- // Dispatch to different CUB DeviceSelect entrypoints //--------------------------------------------------------------------- /** * Dispatch to unique entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); } return error; } //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceSelect */ template __global__ void CDPDispatchKernel(Int2Type cub_backend, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items) { *d_cdp_error = Dispatch(cub_backend, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, 
temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); *d_temp_storage_bytes = temp_storage_bytes; } /** * Dispatch to CDP kernel */ template cudaError_t Dispatch(Int2Type /*dispatch_to*/, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items) { // Invoke kernel to invoke device-side dispatch cudaError_t retval = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, Int2Type{}, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); CubDebugExit(retval); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } #endif // TEST_CDP //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ template void Initialize( int entropy_reduction, T *h_in, int num_items, int max_segment) { unsigned int max_int = (unsigned int) -1; int key = 0; int i = 0; while (i < num_items) { // Select number of repeating occurrences for the current run int repeat; if (max_segment < 0) { repeat = num_items; } else if (max_segment < 2) { repeat = 1; } else { RandomBits(repeat, entropy_reduction); repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int)); repeat = CUB_MAX(1, repeat); } int j = i; while (j < CUB_MIN(i + repeat, num_items)) { InitValue(INTEGER_SEED, h_in[j], key); j++; } i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve unique problem */ template < typename InputIteratorT, typename T> int Solve( InputIteratorT h_in, T *h_reference, int num_items) { int num_selected = 0; if (num_items > 0) { h_reference[num_selected] = h_in[0]; num_selected++; } for (int i = 1; i < num_items; ++i) { if (h_in[i] != h_in[i - 1]) { h_reference[num_selected] = h_in[i]; num_selected++; } } return num_selected; } /** * Test DeviceSelect for a given problem input */ template < Backend BACKEND, typename DeviceInputIteratorT, typename T> void Test( DeviceInputIteratorT d_in, T *h_reference, int num_selected, int num_items) { // Allocate device output array and num selected T *d_out = NULL; int *d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Clear device output array CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items)); 
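// Also clear the device-side selection counter so the correctness check that follows
// reads a value written by this run rather than stale device memory.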
CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int))); // Run warmup/correctness iteration CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items)); // Check for correctness (and display results, if specified) int compare1 = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); printf("\t Data %s ", compare1 ? "FAIL" : "PASS"); int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare2 ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Performance GpuTimer gpu_timer; gpu_timer.Start(); CubDebugExit(Dispatch(Int2Type(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items)); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = float((num_items + num_selected) * sizeof(T)) / avg_millis / 1000.0f / 1000.0f; printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); } printf("\n\n"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Cleanup if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, compare1 | compare2); } /** * Test DeviceSelect on pointer type */ template < Backend BACKEND, typename T> void TestPointer( int num_items, int entropy_reduction, int max_segment) { // Allocate host arrays T* h_in = new T[num_items]; T* h_reference = new T[num_items]; // Initialize problem and solution Initialize(entropy_reduction, h_in, num_items, max_segment); int num_selected = Solve(h_in, h_reference, num_items); printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n", (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_selected, float(num_items) / num_selected, typeid(T).name(), (int) sizeof(T), entropy_reduction); fflush(stdout); // Allocate problem device arrays T *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice)); // Run Test Test(d_in, h_reference, num_selected, num_items); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); } /** * Test DeviceSelect on iterator type */ template < Backend BACKEND, typename T> void TestIterator( int num_items) { // Use a counting iterator as the input CountingInputIterator h_in(0); // Allocate host arrays T* h_reference = new T[num_items]; // Initialize problem and solution int num_selected = Solve(h_in, h_reference, num_items); printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n", (BACKEND == CDP) ? 
"CDP CUB" : "CUB", num_items, num_selected, float(num_items) / num_selected, typeid(T).name(), (int) sizeof(T)); fflush(stdout); // Run Test Test(h_in, h_reference, num_selected, num_items); // Cleanup if (h_reference) delete[] h_reference; } /** * Test different gen modes */ template < Backend BACKEND, typename T> void Test( int num_items) { for (int max_segment = 1; ((max_segment > 0) && (max_segment < num_items)); max_segment *= 11) { TestPointer(num_items, 0, max_segment); TestPointer(num_items, 2, max_segment); TestPointer(num_items, 7, max_segment); } } /** * Test different dispatch */ template < typename T> void TestOp( int num_items) { #if TEST_CDP == 0 Test(num_items); #elif TEST_CDP == 1 Test(num_items); #endif // TEST_CDP } /** * Test different input sizes */ template void Test( int num_items) { if (num_items < 0) { TestOp(0); TestOp(1); TestOp(100); TestOp(10000); TestOp(1000000); } else { TestOp(num_items); } } template void TestIteratorOp(int num_items) { void *d_temp_storage{}; std::size_t temp_storage_size{}; thrust::device_vector num_selected(1); auto in = thrust::make_counting_iterator(static_cast(0)); auto out = thrust::make_discard_iterator(); CubDebugExit(cub::DeviceSelect::Unique(d_temp_storage, temp_storage_size, in, out, num_selected.begin(), num_items)); thrust::device_vector temp_storage(temp_storage_size); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DeviceSelect::Unique(d_temp_storage, temp_storage_size, in, out, num_selected.begin(), num_items)); AssertEquals(num_selected[0], num_items); } template void TestIterator(int num_items) { if (num_items < 0) { TestIteratorOp(0); TestIteratorOp(1); TestIteratorOp(100); TestIteratorOp(10000); TestIteratorOp(1000000); } else { TestIteratorOp(num_items); } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = -1; int entropy_reduction = 0; int maxseg = 1000; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("maxseg", maxseg); args.GetCmdLineArgument("entropy", entropy_reduction); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--i= " "[--device=] " "[--maxseg=]" "[--entropy=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); // %PARAM% TEST_CDP cdp 0:1 // Test different input types Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); TestIterator(num_items); return 0; } cub-2.0.1/test/test_device_select_unique_by_key.cu000066400000000000000000000525771434614775400224170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of DeviceSelect::Unique utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include "test_util.h" #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_timing_iterations = 0; int g_repeat = 0; float g_device_giga_bandwidth; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; //--------------------------------------------------------------------- // Dispatch to different CUB DeviceSelect entrypoints //--------------------------------------------------------------------- /** * Dispatch to unique entrypoint */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t &temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items) { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items); } return error; } //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceSelect */ template __global__ void CDPDispatchKernel(Int2Type cub_backend, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT 
d_values_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items) { *d_cdp_error = Dispatch(cub_backend, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items); *d_temp_storage_bytes = temp_storage_bytes; } /** * Dispatch to CDP kernel */ template cudaError_t Dispatch(Int2Type /*dispatch_to*/, int timing_timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, OffsetT num_items) { // Invoke kernel to invoke device-side dispatch cudaError_t retval = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, Int2Type{}, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items); CubDebugExit(retval); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } #endif // TEST_CDP //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ template void Initialize( int entropy_reduction, T *h_in, int num_items, int max_segment) { unsigned int max_int = (unsigned int) -1; int key = 0; int i = 0; while (i < num_items) { // Select number of repeating occurrences for the current run int repeat; if (max_segment < 0) { repeat = num_items; } else if (max_segment < 2) { repeat = 1; } else { RandomBits(repeat, entropy_reduction); repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int)); repeat = CUB_MAX(1, repeat); } int j = i; while (j < CUB_MIN(i + repeat, num_items)) { InitValue(INTEGER_SEED, h_in[j], key); j++; } i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve unique problem */ template < typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyT, typename ValueT> int Solve( KeyInputIteratorT h_keys_in, ValueInputIteratorT h_values_in, KeyT *h_keys_reference, ValueT *h_values_reference, int num_items) { int num_selected = 0; if (num_items > 0) { h_keys_reference[num_selected] = h_keys_in[0]; h_values_reference[num_selected] = h_values_in[0]; num_selected++; } for (int i = 1; i < num_items; ++i) { if (h_keys_in[i] != h_keys_in[i - 1]) { h_keys_reference[num_selected] = h_keys_in[i]; h_values_reference[num_selected] = h_values_in[i]; num_selected++; } } return num_selected; } /** * Test DeviceSelect for a given problem input */ template < Backend BACKEND, typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyT, typename ValueT> void Test( KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyT *h_keys_reference, ValueT *h_values_reference, int num_selected, int num_items) { // Allocate device output array and num selected KeyT *d_keys_out = NULL; ValueT *d_values_out = NULL; int *d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items)); 
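// UniqueByKey keeps the value paired with the first key of each run (see Solve above),
// so the values output array needs the same capacity as the keys output array.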
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Clear device output array CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items)); CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items)); CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int))); // Run warmup/correctness iteration CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items)); // Check for correctness (and display results, if specified) int compare11 = CompareDeviceResults(h_keys_reference, d_keys_out, num_selected, true, g_verbose); int compare12 = CompareDeviceResults(h_values_reference, d_values_out, num_selected, true, g_verbose); int compare1 = compare11 && compare12; printf("\t Data %s ", compare1 ? "FAIL" : "PASS"); int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare2 ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Performance GpuTimer gpu_timer; gpu_timer.Start(); CubDebugExit(Dispatch(Int2Type(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items)); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = float((num_items + num_selected) * (sizeof(KeyT) + sizeof(ValueT))) / avg_millis / 1000.0f / 1000.0f; printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); } printf("\n\n"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Cleanup if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out)); if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out)); if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, compare1 | compare2); } /** * Test DeviceSelect on pointer type */ template < Backend BACKEND, typename KeyT, typename ValueT> void TestPointer( int num_items, int entropy_reduction, int max_segment) { // Allocate host arrays KeyT* h_keys_in = new KeyT[num_items]; ValueT* h_values_in = new ValueT[num_items]; KeyT* h_keys_reference = new 
KeyT[num_items]; ValueT* h_values_reference = new ValueT[num_items]; // Initialize problem and solution Initialize(entropy_reduction, h_keys_in, num_items, max_segment); Initialize(entropy_reduction, h_values_in, num_items, max_segment); int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items); printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n", (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_selected, float(num_items) / num_selected, typeid(KeyT).name(), (int) sizeof(KeyT), entropy_reduction); fflush(stdout); // Allocate problem device arrays KeyT *d_keys_in = NULL; ValueT *d_values_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice)); // Run Test Test(d_keys_in, d_values_in, h_keys_reference, h_values_reference, num_selected, num_items); // Cleanup if (h_keys_in) delete[] h_keys_in; if (h_values_in) delete[] h_values_in; if (h_keys_reference) delete[] h_keys_reference; if (h_values_reference) delete[] h_values_reference; if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in)); if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in)); } /** * Test DeviceSelect on iterator type */ template < Backend BACKEND, typename KeyT, typename ValueT> void TestIterator( int num_items) { // Use a counting iterator as the input CountingInputIterator h_keys_in(0); CountingInputIterator h_values_in(0); // Allocate host arrays KeyT* h_keys_reference = new KeyT[num_items]; ValueT* h_values_reference = new ValueT[num_items]; // Initialize problem and solution int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items); printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n", (BACKEND == CDP) ? 
"CDP CUB" : "CUB", num_items, num_selected, float(num_items) / num_selected, typeid(KeyT).name(), (int) sizeof(ValueT)); fflush(stdout); // Run Test Test(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_selected, num_items); // Cleanup if (h_keys_reference) delete[] h_keys_reference; if (h_values_reference) delete[] h_values_reference; } /** * Test different gen modes */ template < Backend BACKEND, typename KeyT, typename ValueT> void Test( int num_items) { for (int max_segment = 1; ((max_segment > 0) && (max_segment < num_items)); max_segment *= 11) { TestPointer(num_items, 0, max_segment); TestPointer(num_items, 2, max_segment); TestPointer(num_items, 7, max_segment); } } /** * Test different dispatch */ template < typename KeyT, typename ValueT> void TestOp( int num_items) { #if TEST_CDP == 0 Test(num_items); #elif TEST_CDP == 1 Test(num_items); #endif // TEST_CDP } /** * Test different input sizes */ template < typename KeyT, typename ValueT> void Test( int num_items) { if (num_items < 0) { TestOp(0); TestOp(1); TestOp(100); TestOp(10000); TestOp(1000000); } else { TestOp(num_items); } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = -1; int entropy_reduction = 0; int maxseg = 1000; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("repeat", g_repeat); args.GetCmdLineArgument("maxseg", maxseg); args.GetCmdLineArgument("entropy", entropy_reduction); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--i= " "[--device=] " "[--maxseg=]" "[--entropy=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); // %PARAM% TEST_CDP cdp 0:1 // Test different input types Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); Test(num_items); return 0; } cub-2.0.1/test/test_device_spmv.cu000066400000000000000000000453521434614775400171660ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include "test_util.h" bool g_verbose = false; //============================================================================== // Casts char types to int for numeric printing template T print_cast(T val) { return val; } int print_cast(char val) { return static_cast(val); } int print_cast(signed char val) { return static_cast(val); } int print_cast(unsigned char val) { return static_cast(val); } //============================================================================== // Print a vector to out template void print_vector(std::ostream& out, const VectorT& vec) { bool first = true; for (const auto& val : vec) { if (!first) { out << ", "; } first = false; out << print_cast(val); } } //============================================================================== // Simple CSR matrix implementation. // HostStorage controls whether data is stored on the host or device. // Use the host_csr_matrix and device_csr_matrix aliases for code clarity. template struct csr_matrix { csr_matrix(int num_rows, int num_cols) : m_row_offsets(static_cast(num_rows + 1), 0) , m_num_rows(num_rows) , m_num_columns(num_cols) {} // host/device conversion constructor explicit csr_matrix(const csr_matrix& other) : m_values(other.m_values) , m_row_offsets(other.m_row_offsets) , m_column_indices(other.m_column_indices) , m_num_rows(other.m_num_rows) , m_num_columns(other.m_num_columns) , m_num_nonzeros(other.m_num_nonzeros) {} // Note that this must append to the values array. Finish filling each row // before adding to the next, and each row's columns must be added in order. // Must call `finalize` once all items are added. 
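//
// Illustrative example (not part of the original test): for the 2x3 matrix
//
//   [ 5 0 7 ]
//   [ 0 0 2 ]
//
// the expected call sequence is
//
//   host_csr_matrix<float> m(2, 3);
//   m.append_value(0, 0, 5.0f);
//   m.append_value(0, 2, 7.0f);
//   m.append_value(1, 2, 2.0f);
//   m.finalize();
//
// leaving values = [5, 7, 2], column_indices = [0, 2, 2], and
// row_offsets = [0, 2, 3] (finalize() turns the per-row counts accumulated by
// append_value into offsets via an exclusive scan).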
void append_value(int row, int col, ValueT value) { ++m_num_nonzeros; ++m_row_offsets[row]; m_column_indices.push_back(col); m_values.push_back(std::move(value)); } void finalize() { thrust::exclusive_scan(m_row_offsets.cbegin(), m_row_offsets.cend(), m_row_offsets.begin()); AssertEquals(m_row_offsets.back(), m_num_nonzeros); } const ValueT* get_values() const { return thrust::raw_pointer_cast(m_values.data()); } const int* get_row_offsets() const { return thrust::raw_pointer_cast(m_row_offsets.data()); } int get_row_offset(int row) const { return m_row_offsets[row]; } int get_row_num_nonzero(int row) const { return m_row_offsets[row + 1] - m_row_offsets[row]; } const int* get_column_indices() const { return thrust::raw_pointer_cast(m_column_indices.data()); } int get_num_rows() const { return m_num_rows; } int get_num_columns() const { return m_num_columns; } int get_num_nonzeros() const { return m_num_nonzeros; } void print_internals(std::ostream& out) const { out << (HostStorage ? "host" : "device") << "_csr_matrix" << "(" << m_num_rows << ", " << m_num_columns << ")\n" << " - num_elems: " << (m_num_rows * m_num_columns) << "\n" << " - num_nonzero: " << m_num_nonzeros << "\n" << " - row_offsets:\n ["; print_vector(out, m_row_offsets); out << "]\n" << " - column_indices:\n ["; print_vector(out, m_column_indices); out << "]\n" << " - values:\n ["; print_vector(out, m_values); out << "]\n"; } void print_summary(std::ostream& out) const { const int num_elems = m_num_rows * m_num_columns; const float fill_ratio = num_elems == 0 ? 0.f : (static_cast(m_num_nonzeros) / static_cast(num_elems)); out << m_num_rows << "x" << m_num_columns << ", " << m_num_nonzeros << "/" << num_elems << " (" << fill_ratio << ")\n"; } friend class csr_matrix; private: template using vector_t = cub::detail::conditional_t, thrust::device_vector>; vector_t m_values; vector_t m_row_offsets; vector_t m_column_indices; int m_num_rows{0}; int m_num_columns{0}; int m_num_nonzeros{0}; }; //============================================================================== // Convenience aliases for host/device csr_matrix types. template using host_csr_matrix = csr_matrix; template using device_csr_matrix = csr_matrix; //============================================================================== // Compare two floats within a tolerance. // This mimics the approach used by Thrust's ASSERT_ALMOST_EQUAL checks. template struct fp_almost_equal_functor { __host__ __device__ bool operator()(ValueT v1, ValueT v2) const { constexpr double r_tol = 1e-3; constexpr double a_tol = 1e-2; const double limit = r_tol * (std::fabs(v1) + std::fabs(v2)) + a_tol; return std::fabs(v1 - v2) <= limit; } }; //============================================================================== // Compare the reference and cub output vectors. // Use fuzzy check for floating point values. 
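//
// With the tolerances in fp_almost_equal_functor above, two values compare equal when
// |v1 - v2| <= 1e-3 * (|v1| + |v2|) + 1e-2. For example, 100.0 and 100.15 compare
// equal (0.15 <= ~0.21), while 1.0 and 1.5 are reported as a mismatch (0.5 > ~0.0125).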
template bool compare_results(std::true_type /* is_fp */, const thrust::host_vector& h_vec1, const thrust::device_vector& d_vec2) { thrust::device_vector d_vec1(h_vec1); auto err = thrust::mismatch(d_vec1.cbegin(), d_vec1.cend(), d_vec2.cbegin(), fp_almost_equal_functor{}); if (err.first == d_vec1.cend() || err.second == d_vec2.cend()) { return true; } else { thrust::host_vector h_vec2(d_vec2); const auto idx = thrust::distance(d_vec1.cbegin(), err.first); std::cerr << "Mismatch at position " << idx << ": " << print_cast(ValueT{h_vec1[idx]}) << " vs " << print_cast(ValueT{h_vec2[idx]}) << std::endl; return false; } }; template bool compare_results(std::false_type /* is_fp */, const thrust::host_vector& h_vec1, const thrust::device_vector& d_vec2) { thrust::device_vector d_vec1(h_vec1); auto err = thrust::mismatch(d_vec1.cbegin(), d_vec1.cend(), d_vec2.cbegin()); if (err.first == d_vec1.cend() || err.second == d_vec2.cend()) { return true; } else { thrust::host_vector h_vec2(d_vec2); const auto idx = thrust::distance(d_vec1.cbegin(), err.first); std::cerr << "Mismatch at position " << idx << ": " << print_cast(ValueT{h_vec1[idx]}) << " vs " << print_cast(ValueT{h_vec2[idx]}) << std::endl; return false; } } //============================================================================== // Generate a random host_csr_matrix with the specified dimensions. // target_fill_ratio is the target fraction of non-zero elements (may be more // or less in the output). template host_csr_matrix make_random_csr_matrix(int num_rows, int num_cols, float target_fill_ratio) { host_csr_matrix mat{num_rows, num_cols}; for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const bool is_non_zero = RandomValue(1.f) < target_fill_ratio; if (!is_non_zero) { continue; } if (std::is_floating_point::value) { // Keep fp numbers somewhat small, from -100 -> 100; otherwise we run // into issues with nans/infs ValueT value = (RandomValue(static_cast(200)) - static_cast(100)); mat.append_value(row, col, value); } else { ValueT value{}; InitValue(RANDOM, value); mat.append_value(row, col, value); } } } mat.finalize(); const int num_elements = num_rows * num_cols; const float actual_fill_ratio = static_cast(mat.get_num_nonzeros()) / static_cast(num_elements); if (g_verbose) { printf("Created host_csr_matrix<%s>(%d, %d)\n" " - NumElements: %d\n" " - NumNonZero: %d\n" " - Target fill: %0.2f%%\n" " - Actual fill: %0.2f%%\n", typeid(ValueT).name(), num_rows, num_cols, num_elements, mat.get_num_nonzeros(), target_fill_ratio, actual_fill_ratio); } return mat; } //============================================================================== // Fill a vector with random values. 
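// Floating-point entries are kept in roughly [-100, 100], mirroring
// make_random_csr_matrix above, so the dot products accumulated by the
// reference SpMV stay well away from overflow/inf/nan.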
template thrust::host_vector make_random_vector(int len) { thrust::host_vector vec(len); for (auto& val : vec) { if (std::is_floating_point::value) { // Keep fp numbers somewhat small; otherwise we run into issues with // nans/infs val = RandomValue(static_cast(200)) - static_cast(100); } else { InitValue(RANDOM, val); } } return vec; } //============================================================================== // Serial y = Ax computation template void compute_reference_solution(const host_csr_matrix& a, const thrust::host_vector& x, thrust::host_vector& y) { if (a.get_num_rows() == 0 || a.get_num_columns() == 0) { return; } for (int row = 0; row < a.get_num_rows(); ++row) { const int row_offset = a.get_row_offset(row); const int row_length = a.get_row_num_nonzero(row); const int* cols = a.get_column_indices() + row_offset; const int* cols_end = cols + row_length; const ValueT* values = a.get_values() + row_offset; ValueT accum{}; while (cols < cols_end) { accum += (*values++) * x[*cols++]; } y[row] = accum; } } //============================================================================== // cub::DeviceSpmv::CsrMV y = Ax computation template void compute_cub_solution(const device_csr_matrix& a, const thrust::device_vector& x, thrust::device_vector& y) { thrust::device_vector temp_storage; std::size_t temp_storage_bytes{}; auto err = cub::DeviceSpmv::CsrMV(nullptr, temp_storage_bytes, a.get_values(), a.get_row_offsets(), a.get_column_indices(), thrust::raw_pointer_cast(x.data()), thrust::raw_pointer_cast(y.data()), a.get_num_rows(), a.get_num_columns(), a.get_num_nonzeros()); CubDebugExit(err); temp_storage.resize(temp_storage_bytes); err = cub::DeviceSpmv::CsrMV(thrust::raw_pointer_cast(temp_storage.data()), temp_storage_bytes, a.get_values(), a.get_row_offsets(), a.get_column_indices(), thrust::raw_pointer_cast(x.data()), thrust::raw_pointer_cast(y.data()), a.get_num_rows(), a.get_num_columns(), a.get_num_nonzeros()); CubDebugExit(err); } //============================================================================== // Compute y = Ax twice, one reference and one cub::DeviceSpmv, and compare the // results. 
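//
// compute_cub_solution above uses the standard CUB two-phase idiom: the first
// cub::DeviceSpmv::CsrMV call passes a null temporary-storage pointer and only fills in
// temp_storage_bytes, the storage is then allocated, and the same call is repeated to
// perform the actual y = Ax computation. A minimal sketch of that pattern (illustrative
// only, mirroring the arguments used above):
//
//   std::size_t bytes = 0;
//   cub::DeviceSpmv::CsrMV(nullptr, bytes,
//                          d_values, d_row_offsets, d_column_indices,
//                          d_x, d_y, num_rows, num_cols, num_nonzeros); // size query
//   // ...allocate `bytes` bytes of device memory as d_temp...
//   cub::DeviceSpmv::CsrMV(d_temp, bytes,
//                          d_values, d_row_offsets, d_column_indices,
//                          d_x, d_y, num_rows, num_cols, num_nonzeros); // actual SpMV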
template void test_spmv(const host_csr_matrix& h_a, const thrust::host_vector& h_x) { if (g_verbose) { std::cout << "Testing cub::DeviceSpmv on inputs:\n"; h_a.print_internals(std::cout); std::cout << "x vector:\n ["; print_vector(std::cout, h_x); std::cout << "]" << std::endl; } else { h_a.print_summary(std::cout); } const device_csr_matrix d_a(h_a); const thrust::device_vector d_x(h_x); thrust::host_vector h_y(h_a.get_num_rows()); thrust::device_vector d_y(d_a.get_num_rows()); compute_reference_solution(h_a, h_x, h_y); compute_cub_solution(d_a, d_x, d_y); if (g_verbose) { std::cout << "reference output:\n ["; print_vector(std::cout, h_y); std::cout << "]\n"; thrust::host_vector tmp_y(d_y); std::cout << "cub::DeviceSpmv output:\n ["; print_vector(std::cout, tmp_y); std::cout << "]" << std::endl; } constexpr auto is_fp = std::is_floating_point{}; AssertTrue(compare_results(is_fp, h_y, d_y)); } //============================================================================== // Test example from cub::DeviceSpmv documentation template void test_doc_example() { std::cout << "\n\ntest_doc_example<" << typeid(ValueT).name() << ">()" << std::endl; host_csr_matrix h_a(9, 9); h_a.append_value(0, 1, ValueT{1}); h_a.append_value(0, 3, ValueT{1}); h_a.append_value(1, 0, ValueT{1}); h_a.append_value(1, 2, ValueT{1}); h_a.append_value(1, 4, ValueT{1}); h_a.append_value(2, 1, ValueT{1}); h_a.append_value(2, 5, ValueT{1}); h_a.append_value(3, 0, ValueT{1}); h_a.append_value(3, 4, ValueT{1}); h_a.append_value(3, 6, ValueT{1}); h_a.append_value(4, 1, ValueT{1}); h_a.append_value(4, 3, ValueT{1}); h_a.append_value(4, 5, ValueT{1}); h_a.append_value(4, 7, ValueT{1}); h_a.append_value(5, 2, ValueT{1}); h_a.append_value(5, 4, ValueT{1}); h_a.append_value(5, 8, ValueT{1}); h_a.append_value(6, 3, ValueT{1}); h_a.append_value(6, 7, ValueT{1}); h_a.append_value(7, 4, ValueT{1}); h_a.append_value(7, 6, ValueT{1}); h_a.append_value(7, 8, ValueT{1}); h_a.append_value(8, 5, ValueT{1}); h_a.append_value(8, 7, ValueT{1}); h_a.finalize(); thrust::host_vector h_x(9, ValueT{1}); test_spmv(h_a, h_x); } //============================================================================== // Generate and test a random SpMV operation with the given parameters. template void test_random(int rows, int cols, float target_fill_ratio) { std::cout << "\n\ntest_random<" << typeid(ValueT).name() << ">(" << rows << ", " << cols << ", " << target_fill_ratio << ")" << std::endl; host_csr_matrix h_a = make_random_csr_matrix(rows, cols, target_fill_ratio); thrust::host_vector h_x = make_random_vector(cols); test_spmv(h_a, h_x); } //============================================================================== // Dispatch many random SpMV tests over a variety of parameters. 
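// The sweep below doubles the row and column counts from 1 toward 10000 (skipping
// combinations above 100000 total elements), steps the target fill ratio from 0 to
// just past 1 in roughly 1/3 increments, retries each case with nearby
// non-power-of-two dimensions (rows + 97, cols + 83), and also covers the degenerate
// 0-row / 0-column inputs.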
template void test_random() { test_random(0, 0, 1.f); test_random(0, 1, 1.f); test_random(1, 0, 1.f); const int dim_min = 1; const int dim_max = 10000; const int max_num_elems = 100000; const float ratio_min = 0.f; const float ratio_max = 1.1f; // a lil over to account for fp errors const float ratio_step = 0.3334f; for (int rows = dim_min; rows < dim_max; rows <<= 1) { for (int cols = dim_min; cols < dim_max; cols <<= 1) { if (rows * cols >= max_num_elems) { continue; } for (float ratio = ratio_min; ratio < ratio_max; ratio += ratio_step) { test_random(rows, cols, ratio); // Test nearby non-power-of-two dims: test_random(rows + 97, cols + 83, ratio); } } } } //============================================================================== // Dispatch many SpMV tests for a given ValueT. template void test_type() { test_doc_example(); test_random(); } //============================================================================== // Dispatch many SpMV tests over a variety of types. void test_types() { test_type(); test_type(); test_type(); test_type(); test_type(); } int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] verbose" "\n", argv[0]); exit(0); } CubDebugExit(args.DeviceInit()); test_types(); } cub-2.0.1/test/test_device_three_way_partition.cu000066400000000000000000000464601434614775400222620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include using namespace cub; template struct LessThan { T compare; explicit __host__ LessThan(T compare) : compare(compare) {} __device__ bool operator()(const T &a) const { return a < compare; } }; template struct EqualTo { T compare; explicit __host__ EqualTo(T compare) : compare(compare) {} __device__ bool operator()(const T &a) const { return a == compare; } }; template struct GreaterOrEqual { T compare; explicit __host__ GreaterOrEqual(T compare) : compare(compare) {} __device__ bool operator()(const T &a) const { return a >= compare; } }; template void TestEmpty() { int num_items = 0; T *in {}; T *d_first_part_out {}; T *d_second_part_out {}; T *d_unselected_out {}; T *d_num_selected_out {}; LessThan le(T{0}); GreaterOrEqual ge(T{1}); std::size_t temp_storage_size {}; CubDebugExit(cub::DevicePartition::If(nullptr, temp_storage_size, in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, num_items, le, ge)); thrust::device_vector temp_storage(temp_storage_size); std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DevicePartition::If(d_temp_storage, temp_storage_size, in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, num_items, le, ge)); } template class ThreeWayPartitionResult { public: ThreeWayPartitionResult() = delete; ThreeWayPartitionResult(int num_items) : first_part(num_items) , second_part(num_items) , unselected(num_items) {} thrust::device_vector first_part; thrust::device_vector second_part; thrust::device_vector unselected; int num_items_in_first_part {}; int num_items_in_second_part {}; int num_unselected_items {}; bool operator!=(const ThreeWayPartitionResult &other) { return std::tie(num_items_in_first_part, num_items_in_second_part, num_unselected_items, first_part, second_part, unselected) != std::tie(other.num_items_in_first_part, other.num_items_in_second_part, other.num_unselected_items, other.first_part, other.second_part, other.unselected); } }; template < typename FirstPartSelectionOp, typename SecondPartSelectionOp, typename T> ThreeWayPartitionResult CUBPartition( FirstPartSelectionOp first_selector, SecondPartSelectionOp second_selector, thrust::device_vector &in) { const int num_items = static_cast(in.size()); ThreeWayPartitionResult result(num_items); T *d_in = thrust::raw_pointer_cast(in.data()); T *d_first_part_out = thrust::raw_pointer_cast(result.first_part.data()); T *d_second_part_out = thrust::raw_pointer_cast(result.second_part.data()); T *d_unselected_out = thrust::raw_pointer_cast(result.unselected.data()); thrust::device_vector num_selected_out(2); int *d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); std::size_t temp_storage_size {}; CubDebugExit(cub::DevicePartition::If(nullptr, temp_storage_size, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, num_items, first_selector, second_selector)); thrust::device_vector temp_storage(temp_storage_size); std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DevicePartition::If(d_temp_storage, temp_storage_size, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, num_items, first_selector, second_selector)); thrust::host_vector 
h_num_selected_out(num_selected_out); result.num_items_in_first_part = h_num_selected_out[0]; result.num_items_in_second_part = h_num_selected_out[1]; result.num_unselected_items = num_items - h_num_selected_out[0] - h_num_selected_out[1]; return result; } template < typename FirstPartSelectionOp, typename SecondPartSelectionOp, typename T> ThreeWayPartitionResult ThrustPartition( FirstPartSelectionOp first_selector, SecondPartSelectionOp second_selector, thrust::device_vector &in) { const int num_items = static_cast(in.size()); ThreeWayPartitionResult result(num_items); thrust::device_vector intermediate_result(num_items); auto intermediate_iterators = thrust::partition_copy(in.begin(), in.end(), result.first_part.begin(), intermediate_result.begin(), first_selector); result.num_items_in_first_part = static_cast( thrust::distance(result.first_part.begin(), intermediate_iterators.first)); auto final_iterators = thrust::partition_copy( intermediate_result.begin(), intermediate_result.begin() + (num_items - result.num_items_in_first_part), result.second_part.begin(), result.unselected.begin(), second_selector); result.num_items_in_second_part = static_cast( thrust::distance(result.second_part.begin(), final_iterators.first)); result.num_unselected_items = static_cast( thrust::distance(result.unselected.begin(), final_iterators.second)); return result; } template void TestEmptyFirstPart(int num_items) { thrust::device_vector in(num_items); thrust::sequence(in.begin(), in.end()); T first_unselected_val = T{0}; T first_val_of_second_part = static_cast(num_items / 2); LessThan le(first_unselected_val); GreaterOrEqual ge(first_val_of_second_part); auto cub_result = CUBPartition(le, ge, in); auto thrust_result = ThrustPartition(le, ge, in); AssertEquals(cub_result, thrust_result); AssertEquals(cub_result.num_items_in_first_part, 0); } template void TestEmptySecondPart(int num_items) { thrust::device_vector in(num_items); thrust::sequence(in.begin(), in.end()); T first_unselected_val = static_cast(num_items / 2); T first_val_of_second_part = T{0}; // empty set for unsigned types GreaterOrEqual ge(first_unselected_val); LessThan le(first_val_of_second_part); auto cub_result = CUBPartition(ge, le, in); auto thrust_result = ThrustPartition(ge, le, in); AssertEquals(cub_result, thrust_result); AssertEquals(cub_result.num_items_in_second_part, 0); } template void TestEmptyUnselectedPart(int num_items) { thrust::device_vector in(num_items); thrust::sequence(in.begin(), in.end()); T first_unselected_val = static_cast(num_items / 2); LessThan le(first_unselected_val); GreaterOrEqual ge(first_unselected_val); auto cub_result = CUBPartition(le, ge, in); auto thrust_result = ThrustPartition(le, ge, in); AssertEquals(cub_result, thrust_result); AssertEquals(cub_result.num_unselected_items, 0); } template void TestUnselectedOnly(int num_items) { thrust::device_vector in(num_items); thrust::sequence(in.begin(), in.end()); T first_val_of_second_part = T{0}; // empty set for unsigned types LessThan le(first_val_of_second_part); auto cub_result = CUBPartition(le, le, in); auto thrust_result = ThrustPartition(le, le, in); AssertEquals(cub_result, thrust_result); AssertEquals(cub_result.num_unselected_items, num_items); AssertEquals(cub_result.num_items_in_first_part, 0); AssertEquals(cub_result.num_items_in_second_part, 0); } template struct Pair { Key key; Value value; __host__ __device__ Pair() : key(Key{}) , value(Value{}) {} __host__ __device__ Pair(Key key) : key(key) , value(Value{}) {} __host__ __device__ 
Pair(Key key, Value value) : key(key) , value(value) {} __host__ __device__ bool operator<(const Pair &b) const { return key < b.key; } __host__ __device__ bool operator>=(const Pair &b) const { return key >= b.key; } }; template __device__ __host__ bool operator==( const Pair &lhs, const Pair &rhs) { return lhs.key == rhs.key && lhs.value == lhs.value; } template struct CountToPair { template __device__ __host__ Pairoperator()(OffsetT id) { return Pair(static_cast(id), id); } }; template void TestStability(int num_items) { using T = Pair; thrust::device_vector in(num_items); thrust::tabulate(in.begin(), in.end(), CountToPair{}); T first_unselected_val = static_cast(num_items / 3); T first_val_of_second_part = static_cast(2 * num_items / 3); LessThan le(first_unselected_val); GreaterOrEqual ge(first_val_of_second_part); auto cub_result = CUBPartition(le, ge, in); auto thrust_result = ThrustPartition(le, ge, in); AssertEquals(cub_result, thrust_result); } template void TestReverseIterator(int num_items) { int num_items_in_first_part = num_items / 3; int num_unselected_items = 2 * num_items / 3; T first_part_val {0}; T second_part_val {1}; T unselected_part_val {2}; thrust::device_vector in(num_items, second_part_val); thrust::fill_n(in.begin(), num_items_in_first_part, first_part_val); thrust::fill_n(in.begin() + num_items_in_first_part, num_unselected_items, unselected_part_val); thrust::shuffle(in.begin(), in.end(), thrust::default_random_engine{}); thrust::device_vector first_and_unselected_part(num_items); EqualTo first_selector{first_part_val}; EqualTo second_selector{second_part_val}; thrust::device_vector num_selected_out(2); std::size_t temp_storage_size {}; CubDebugExit(cub::DevicePartition::If(nullptr, temp_storage_size, in.cbegin(), first_and_unselected_part.begin(), thrust::make_discard_iterator(), first_and_unselected_part.rbegin(), num_selected_out.begin(), num_items, first_selector, second_selector)); thrust::device_vector temp_storage(temp_storage_size); std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DevicePartition::If(d_temp_storage, temp_storage_size, in.cbegin(), first_and_unselected_part.begin(), thrust::make_discard_iterator(), first_and_unselected_part.rbegin(), num_selected_out.begin(), num_items, first_selector, second_selector)); thrust::device_vector h_num_selected_out(num_selected_out); AssertEquals(h_num_selected_out[0], num_items_in_first_part); AssertEquals(thrust::count(first_and_unselected_part.rbegin(), first_and_unselected_part.rbegin() + num_unselected_items, unselected_part_val), num_unselected_items); AssertEquals(thrust::count(first_and_unselected_part.begin(), first_and_unselected_part.begin() + num_items_in_first_part, first_part_val), num_items_in_first_part); } template void TestSingleOutput(int num_items) { int num_items_in_first_part = num_items / 3; int num_unselected_items = 2 * num_items / 3; int num_items_in_second_part = num_items - num_items_in_first_part - num_unselected_items; T first_part_val{0}; T second_part_val{1}; T unselected_part_val{2}; thrust::device_vector in(num_items, second_part_val); thrust::fill_n(in.begin(), num_items_in_first_part, first_part_val); thrust::fill_n(in.begin() + num_items_in_first_part, num_unselected_items, unselected_part_val); thrust::shuffle(in.begin(), in.end(), thrust::default_random_engine{}); thrust::device_vector output(num_items); EqualTo first_selector{first_part_val}; EqualTo second_selector{second_part_val}; thrust::device_vector 
num_selected_out(2); std::size_t temp_storage_size{}; CubDebugExit(cub::DevicePartition::If(nullptr, temp_storage_size, in.cbegin(), output.begin(), output.begin() + num_items_in_first_part, output.rbegin(), num_selected_out.begin(), num_items, first_selector, second_selector)); thrust::device_vector temp_storage(temp_storage_size); std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DevicePartition::If(d_temp_storage, temp_storage_size, in.cbegin(), output.begin(), output.begin() + num_items_in_first_part, output.rbegin(), num_selected_out.begin(), num_items, first_selector, second_selector)); thrust::device_vector h_num_selected_out(num_selected_out); AssertEquals(h_num_selected_out[0], num_items_in_first_part); AssertEquals(h_num_selected_out[1], num_items_in_second_part); AssertEquals(thrust::count(output.rbegin(), output.rbegin() + num_unselected_items, unselected_part_val), num_unselected_items); AssertEquals(thrust::count(output.begin(), output.begin() + num_items_in_first_part, first_part_val), num_items_in_first_part); AssertEquals(thrust::count(output.begin() + num_items_in_first_part, output.begin() + num_items_in_first_part + num_items_in_second_part, second_part_val), num_items_in_second_part); } template void TestNumItemsDependent(int num_items) { TestStability(num_items); TestEmptyFirstPart(num_items); TestEmptySecondPart(num_items); TestEmptyUnselectedPart(num_items); TestUnselectedOnly(num_items); TestReverseIterator(num_items); TestSingleOutput(num_items); } template void TestNumItemsDependent() { for (int num_items = 1; num_items < 1000000; num_items <<= 2) { TestNumItemsDependent(num_items); TestNumItemsDependent(num_items + 31); } } template void Test() { TestEmpty(); TestNumItemsDependent(); } int main(int argc, char **argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); Test(); Test(); Test(); Test(); return 0; } cub-2.0.1/test/test_grid_barrier.cu000066400000000000000000000120201434614775400172770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test evaluation for software global barrier throughput ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /** * Kernel that iterates through the specified number of software global barriers */ __global__ void Kernel( GridBarrier global_barrier, int iterations) { for (int i = 0; i < iterations; i++) { global_barrier.Sync(); } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { cudaError_t retval = cudaSuccess; // Defaults int iterations = 10000; int block_size = 128; int grid_size = -1; // Initialize command line CommandLineArgs args(argc, argv); // Get args args.GetCmdLineArgument("i", iterations); args.GetCmdLineArgument("grid-size", grid_size); args.GetCmdLineArgument("block-size", block_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=]" "[--i=]" "[--grid-size]" "[--block-size]" "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Get device ordinal int device_ordinal; CubDebugExit(cudaGetDevice(&device_ordinal)); // Get device SM version int sm_version = 0; CubDebugExit(SmVersion(sm_version, device_ordinal)); // Get SM properties int sm_count, max_block_threads, max_sm_occupancy; CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal)); CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel, 32)); // Compute grid size and occupancy int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy); if (grid_size == -1) { grid_size = occupancy * sm_count; } else { occupancy = grid_size / sm_count; } printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n", grid_size, block_size, occupancy); fflush(stdout); // Init global barrier GridBarrierLifetime global_barrier; global_barrier.Setup(grid_size); // Time kernel GpuTimer gpu_timer; gpu_timer.Start(); Kernel<<>>(global_barrier, iterations); gpu_timer.Stop(); retval = CubDebug(cudaDeviceSynchronize()); // Output timing results float avg_elapsed = gpu_timer.ElapsedMillis() / float(iterations); printf("%d iterations, %f total elapsed millis, %f avg elapsed millis\n", iterations, gpu_timer.ElapsedMillis(), avg_elapsed); return retval; } 
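// ----------------------------------------------------------------------------
// Note (added): a software global barrier such as cub::GridBarrier can only
// make progress if every thread block of the grid is resident on the device at
// the same time -- blocks that have not yet been scheduled can never reach
// Sync(), so the resident blocks would spin forever. That is why the test
// above caps grid_size at (occupancy * sm_count). A minimal sketch of the same
// sizing logic using the plain CUDA runtime occupancy API instead of
// cub::MaxSmOccupancy (illustrative only, not part of this test; it reuses the
// Kernel, device_ordinal, and block_size names from the test above):
//
//   int sm_count = 0;
//   CubDebugExit(cudaDeviceGetAttribute(
//       &sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
//
//   int blocks_per_sm = 0;
//   CubDebugExit(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
//       &blocks_per_sm, Kernel, block_size, /* dynamicSMemBytes */ 0));
//
//   // Upper bound on the grid size that may safely call GridBarrier::Sync():
//   int max_coresident_grid = blocks_per_sm * sm_count;
// ----------------------------------------------------------------------------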
cub-2.0.1/test/test_iterator.cu000066400000000000000000000406361434614775400165130ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of iterator utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; template struct TransformOp { // Increment transform __host__ __device__ __forceinline__ T operator()(T input) const { T addend; InitValue(INTEGER_SEED, addend, 1); return input + addend; } }; struct SelectOp { template __host__ __device__ __forceinline__ bool operator()(T input) { return true; } }; //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /** * Test random access input iterator */ template < typename InputIteratorT, typename T> __global__ void Kernel( InputIteratorT d_in, T *d_out, InputIteratorT *d_itrs) { d_out[0] = *d_in; // Value at offset 0 d_out[1] = d_in[100]; // Value at offset 100 d_out[2] = *(d_in + 1000); // Value at offset 1000 d_out[3] = *(d_in + 10000); // Value at offset 10000 d_in++; d_out[4] = d_in[0]; // Value at offset 1 d_in += 20; d_out[5] = d_in[0]; // Value at offset 21 
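    // (Comment added for clarity) At this point d_in has been advanced by 21
    // elements in total (operator++ once, then += 20), so the copy stored in
    // d_itrs[0] below records an iterator positioned at offset 21; the host
    // side later rebuilds the same iterator as (d_in + 21) and compares the two.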
d_itrs[0] = d_in; // Iterator at offset 21 d_in -= 10; d_out[6] = d_in[0]; // Value at offset 11; d_in -= 11; d_out[7] = d_in[0]; // Value at offset 0 d_itrs[1] = d_in; // Iterator at offset 0 } //--------------------------------------------------------------------- // Host testing subroutines //--------------------------------------------------------------------- /** * Run iterator test on device */ template < typename InputIteratorT, typename T, int TEST_VALUES> void Test( InputIteratorT d_in, T (&h_reference)[TEST_VALUES]) { // Allocate device arrays T *d_out = NULL; InputIteratorT *d_itrs = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * TEST_VALUES)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_itrs, sizeof(InputIteratorT) * 2)); int compare; // Run unguarded kernel Kernel<<<1, 1>>>(d_in, d_out, d_itrs); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Check results compare = CompareDeviceResults(h_reference, d_out, TEST_VALUES, g_verbose, g_verbose); printf("\tValues: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check iterator at offset 21 InputIteratorT h_itr = d_in + 21; compare = CompareDeviceResults(&h_itr, d_itrs, 1, g_verbose, g_verbose); printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check iterator at offset 0 compare = CompareDeviceResults(&d_in, d_itrs + 1, 1, g_verbose, g_verbose); printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_itrs) CubDebugExit(g_allocator.DeviceFree(d_itrs)); } /** * Test constant iterator */ template void TestConstant(T base) { printf("\nTesting constant iterator on type %s (base: %lld)\n", typeid(T).name(), (unsigned long long) (base)); fflush(stdout); // // Test iterator manipulation in kernel // T h_reference[8] = {base, base, base, base, base, base, base, base}; ConstantInputIterator d_itr(base); Test(d_itr, h_reference); } /** * Test counting iterator */ template void TestCounting(T base) { printf("\nTesting counting iterator on type %s (base: %d) \n", typeid(T).name(), int(base)); fflush(stdout); // // Test iterator manipulation in kernel // // Initialize reference data T h_reference[8]; h_reference[0] = static_cast(base + 0); // Value at offset 0 h_reference[1] = static_cast(base + 100); // Value at offset 100 h_reference[2] = static_cast(base + 1000); // Value at offset 1000 h_reference[3] = static_cast(base + 10000); // Value at offset 10000 h_reference[4] = static_cast(base + 1); // Value at offset 1 h_reference[5] = static_cast(base + 21); // Value at offset 21 h_reference[6] = static_cast(base + 11); // Value at offset 11 h_reference[7] = static_cast(base + 0); // Value at offset 0; CountingInputIterator d_itr(base); Test(d_itr, h_reference); } /** * Test modified iterator */ template void TestModified() { printf("\nTesting cache-modified iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // constexpr int TEST_VALUES = 11000; T *h_data = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) { RandomBits(h_data[i]); } // Allocate device arrays T *d_data = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); // Initialize reference data T h_reference[8]; h_reference[0] = h_data[0]; // Value at offset 0 h_reference[1] = h_data[100]; // 
Value at offset 100 h_reference[2] = h_data[1000]; // Value at offset 1000 h_reference[3] = h_data[10000]; // Value at offset 10000 h_reference[4] = h_data[1]; // Value at offset 1 h_reference[5] = h_data[21]; // Value at offset 21 h_reference[6] = h_data[11]; // Value at offset 11 h_reference[7] = h_data[0]; // Value at offset 0; Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); } /** * Test transform iterator */ template void TestTransform() { printf("\nTesting transform iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // constexpr int TEST_VALUES = 11000; T *h_data = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) { InitValue(INTEGER_SEED, h_data[i], i); } // Allocate device arrays T *d_data = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); TransformOp op; // Initialize reference data T h_reference[8]; h_reference[0] = op(h_data[0]); // Value at offset 0 h_reference[1] = op(h_data[100]); // Value at offset 100 h_reference[2] = op(h_data[1000]); // Value at offset 1000 h_reference[3] = op(h_data[10000]); // Value at offset 10000 h_reference[4] = op(h_data[1]); // Value at offset 1 h_reference[5] = op(h_data[21]); // Value at offset 21 h_reference[6] = op(h_data[11]); // Value at offset 11 h_reference[7] = op(h_data[0]); // Value at offset 0; TransformInputIterator, CastT*> d_itr((CastT*) d_data, op); Test(d_itr, h_reference); if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); } /** * Test tex-obj texture iterator */ template void TestTexObj() { printf("\nTesting tex-obj iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // const unsigned int TEST_VALUES = 11000; const unsigned int DUMMY_OFFSET = 500; const unsigned int DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET; T *h_data = new T[TEST_VALUES]; for (unsigned int i = 0; i < TEST_VALUES; ++i) { RandomBits(h_data[i]); } // Allocate device arrays T *d_data = NULL; T *d_dummy = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice)); // Initialize reference data T h_reference[8]; h_reference[0] = h_data[0]; // Value at offset 0 h_reference[1] = h_data[100]; // Value at offset 100 h_reference[2] = h_data[1000]; // Value at offset 1000 h_reference[3] = h_data[10000]; // Value at offset 10000 h_reference[4] = h_data[1]; // Value at offset 1 h_reference[5] = h_data[21]; // Value at offset 21 h_reference[6] = h_data[11]; // Value at offset 11 h_reference[7] = h_data[0]; // Value at offset 0; // Create and bind obj-based test iterator TexObjInputIterator d_obj_itr; 
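    // (Comment added for clarity) BindTexture below wraps d_data in a
    // cudaTextureObject_t so that device-side dereferences of this iterator are
    // serviced through the texture read path; the byte count gives the extent
    // of the buffer to bind.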
CubDebugExit(d_obj_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES)); Test(d_obj_itr, h_reference); if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy)); } /** * Test texture transform iterator */ template void TestTexTransform() { printf("\nTesting tex-transform iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // constexpr int TEST_VALUES = 11000; T *h_data = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) { InitValue(INTEGER_SEED, h_data[i], i); } // Allocate device arrays T *d_data = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); TransformOp op; // Initialize reference data T h_reference[8]; h_reference[0] = op(h_data[0]); // Value at offset 0 h_reference[1] = op(h_data[100]); // Value at offset 100 h_reference[2] = op(h_data[1000]); // Value at offset 1000 h_reference[3] = op(h_data[10000]); // Value at offset 10000 h_reference[4] = op(h_data[1]); // Value at offset 1 h_reference[5] = op(h_data[21]); // Value at offset 21 h_reference[6] = op(h_data[11]); // Value at offset 11 h_reference[7] = op(h_data[0]); // Value at offset 0; // Create and bind texture iterator typedef TexObjInputIterator TextureIterator; TextureIterator d_tex_itr; CubDebugExit(d_tex_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES)); // Create transform iterator TransformInputIterator, TextureIterator> xform_itr(d_tex_itr, op); Test(xform_itr, h_reference); CubDebugExit(d_tex_itr.UnbindTexture()); if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); } /** * Run non-integer tests */ template void Test(Int2Type /* is_integer */) { TestModified(); TestTransform(); TestTexObj(); TestTexTransform(); } /** * Run integer tests */ template void Test(Int2Type /* is_integer */) { TestConstant(0); TestConstant(99); TestCounting(0); TestCounting(99); // Run non-integer tests Test(Int2Type()); } /** * Run tests */ template void Test() { enum { IS_INTEGER = (Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER) }; // Test non-const type Test(Int2Type()); // Test non-const type Test(Int2Type()); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Get ptx version int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); // Evaluate different data types Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); printf("\nTest complete\n"); fflush(stdout); return 0; } cub-2.0.1/test/test_iterator_deprecated.cu000066400000000000000000000217161434614775400206710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of iterator utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR // This file tests deprecated CUB APIs. 
Silence deprecation warnings: #define CUB_IGNORE_DEPRECATED_API #include #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; CachingDeviceAllocator g_allocator(true); //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /** * Test random access input iterator */ template < typename InputIteratorT, typename T> __global__ void Kernel( InputIteratorT d_in, T *d_out, InputIteratorT *d_itrs) { d_out[0] = *d_in; // Value at offset 0 d_out[1] = d_in[100]; // Value at offset 100 d_out[2] = *(d_in + 1000); // Value at offset 1000 d_out[3] = *(d_in + 10000); // Value at offset 10000 d_in++; d_out[4] = d_in[0]; // Value at offset 1 d_in += 20; d_out[5] = d_in[0]; // Value at offset 21 d_itrs[0] = d_in; // Iterator at offset 21 d_in -= 10; d_out[6] = d_in[0]; // Value at offset 11; d_in -= 11; d_out[7] = d_in[0]; // Value at offset 0 d_itrs[1] = d_in; // Iterator at offset 0 } //--------------------------------------------------------------------- // Host testing subroutines //--------------------------------------------------------------------- /** * Run iterator test on device */ template < typename InputIteratorT, typename T, int TEST_VALUES> void Test( InputIteratorT d_in, T (&h_reference)[TEST_VALUES]) { // Allocate device arrays T *d_out = NULL; InputIteratorT *d_itrs = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * TEST_VALUES)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_itrs, sizeof(InputIteratorT) * 2)); int compare; // Run unguarded kernel Kernel<<<1, 1>>>(d_in, d_out, d_itrs); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Check results compare = CompareDeviceResults(h_reference, d_out, TEST_VALUES, g_verbose, g_verbose); printf("\tValues: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check iterator at offset 21 InputIteratorT h_itr = d_in + 21; compare = CompareDeviceResults(&h_itr, d_itrs, 1, g_verbose, g_verbose); printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check iterator at offset 0 compare = CompareDeviceResults(&d_in, d_itrs + 1, 1, g_verbose, g_verbose); printf("\tIterators: %s\n", (compare) ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (d_out) { CubDebugExit(g_allocator.DeviceFree(d_out)); } if (d_itrs) { CubDebugExit(g_allocator.DeviceFree(d_itrs)); } } /** * Test tex-ref texture iterator */ template void TestTexRef() { printf("\nTesting tex-ref iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // constexpr int TEST_VALUES = 11000; constexpr unsigned int DUMMY_OFFSET = 500; constexpr unsigned int DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET; T *h_data = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) { RandomBits(h_data[i]); } // Allocate device arrays T *d_data = NULL; T *d_dummy = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice)); // Initialize reference data T h_reference[8]; h_reference[0] = h_data[0]; // Value at offset 0 h_reference[1] = h_data[100]; // Value at offset 100 h_reference[2] = h_data[1000]; // Value at offset 1000 h_reference[3] = h_data[10000]; // Value at offset 10000 h_reference[4] = h_data[1]; // Value at offset 1 h_reference[5] = h_data[21]; // Value at offset 21 h_reference[6] = h_data[11]; // Value at offset 11 h_reference[7] = h_data[0]; // Value at offset 0; // Create and bind ref-based test iterator TexRefInputIterator d_ref_itr; CubDebugExit(d_ref_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES)); // Create and bind dummy iterator of same type to check with interferance TexRefInputIterator d_ref_itr2; CubDebugExit(d_ref_itr2.BindTexture((CastT*) d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); Test(d_ref_itr, h_reference); CubDebugExit(d_ref_itr.UnbindTexture()); CubDebugExit(d_ref_itr2.UnbindTexture()); if (h_data) { delete[] h_data; } if (d_data) { CubDebugExit(g_allocator.DeviceFree(d_data)); } if (d_dummy) { CubDebugExit(g_allocator.DeviceFree(d_dummy)); } } /** * Run non-integer tests */ template void Test() { TestTexRef(); } /** * Run tests */ template void Test() { // Test non-const type Test(); // Test non-const type Test(); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Evaluate different data types Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); printf("\nTest complete\n"); fflush(stdout); return 0; } cub-2.0.1/test/test_namespace_wrapped.cu000066400000000000000000000042151434614775400203310ustar00rootroot00000000000000// Wrap thrust and cub in different enclosing namespaces // (In practice, you probably want these to be the same, in which case just // set THRUST_CUB_WRAPPED_NAMESPACE to set both). 
#define THRUST_WRAPPED_NAMESPACE wrap_thrust #define CUB_WRAPPED_NAMESPACE wrap_cub // Enable error checking: #define CUB_STDERR #include #include #include #include #include #include "test_util.h" #include #include // Test that we can use a few common utilities and algorithms from wrapped // Thrust/CUB namespaces at runtime. More extensive testing is performed by the // header tests and the check_namespace.cmake test. int main(int argc, char **argv) { CommandLineArgs args(argc, argv); CubDebugExit(args.DeviceInit()); const std::size_t n = 2048; // Fill a vector with random data: ::wrap_thrust::thrust::host_vector h_input(n); for (auto &val : h_input) { RandomBits(val); } // Test the qualifier macro: THRUST_NS_QUALIFIER::device_vector d_input(h_input); THRUST_NS_QUALIFIER::device_vector d_output(n); std::size_t temp_storage_bytes{}; // Sort with DeviceRadixSort: auto error = ::wrap_cub::cub::DeviceRadixSort::SortKeys( nullptr, temp_storage_bytes, ::wrap_thrust::thrust::raw_pointer_cast(d_input.data()), ::wrap_thrust::thrust::raw_pointer_cast(d_output.data()), static_cast(n)); CubDebugExit(error); ::wrap_thrust::thrust::device_vector temp_storage( temp_storage_bytes); // Test the CUB qualifier macro: error = CUB_NS_QUALIFIER::DeviceRadixSort::SortKeys( ::wrap_thrust::thrust::raw_pointer_cast(temp_storage.data()), temp_storage_bytes, ::wrap_thrust::thrust::raw_pointer_cast(d_input.data()), ::wrap_thrust::thrust::raw_pointer_cast(d_output.data()), static_cast(n)); CubDebugExit(error); // Verify output: if (!::wrap_thrust::thrust::is_sorted(d_output.cbegin(), d_output.cend())) { std::cerr << "Output is not sorted!\n"; return EXIT_FAILURE; } return EXIT_SUCCESS; } cub-2.0.1/test/test_temporary_storage_layout.cu000066400000000000000000000166141434614775400220240ustar00rootroot00000000000000/******************************************************************************* * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include "cub/detail/temporary_storage.cuh" #include "test_util.h" #include template std::size_t GetTemporaryStorageSize(std::size_t (&sizes)[Items]) { void *pointers[Items]{}; std::size_t temp_storage_bytes{}; CubDebugExit( cub::AliasTemporaries(nullptr, temp_storage_bytes, pointers, sizes)); return temp_storage_bytes; } std::size_t GetActualZero() { std::size_t sizes[1]{}; return GetTemporaryStorageSize(sizes); } template void TestEmptyStorage() { cub::detail::temporary_storage::layout temporary_storage; AssertEquals(temporary_storage.get_size(), GetActualZero()); } template void TestPartiallyFilledStorage() { using target_type = std::uint64_t; constexpr std::size_t target_elements = 42; constexpr std::size_t full_slot_elements = target_elements * sizeof(target_type); constexpr std::size_t empty_slot_elements{}; cub::detail::temporary_storage::layout temporary_storage; std::unique_ptr> arrays[StorageSlots]; std::size_t sizes[StorageSlots]{}; for (int slot_id = 0; slot_id < StorageSlots; slot_id++) { auto slot = temporary_storage.get_slot(slot_id); const std::size_t elements = slot_id % 2 == 0 ? full_slot_elements : empty_slot_elements; sizes[slot_id] = elements * sizeof(target_type); arrays[slot_id].reset( new cub::detail::temporary_storage::alias( slot->template create_alias(elements))); } const std::size_t temp_storage_bytes = temporary_storage.get_size(); std::unique_ptr temp_storage( new std::uint8_t[temp_storage_bytes]); temporary_storage.map_to_buffer(temp_storage.get(), temp_storage_bytes); AssertEquals(temp_storage_bytes, GetTemporaryStorageSize(sizes)); for (int slot_id = 0; slot_id < StorageSlots; slot_id++) { if (slot_id % 2 == 0) { AssertTrue(arrays[slot_id]->get() != nullptr); } else { AssertTrue(arrays[slot_id]->get() == nullptr); } } } template void TestGrow() { using target_type = std::uint64_t; constexpr std::size_t target_elements_number = 42; cub::detail::temporary_storage::layout preset_layout; std::unique_ptr> preset_arrays[StorageSlots]; for (int slot_id = 0; slot_id < StorageSlots; slot_id++) { preset_arrays[slot_id].reset( new cub::detail::temporary_storage::alias( preset_layout.get_slot(slot_id)->template create_alias( target_elements_number))); } cub::detail::temporary_storage::layout postset_layout; std::unique_ptr> postset_arrays[StorageSlots]; for (int slot_id = 0; slot_id < StorageSlots; slot_id++) { postset_arrays[slot_id].reset( new cub::detail::temporary_storage::alias( postset_layout.get_slot(slot_id)->template create_alias())); postset_arrays[slot_id]->grow(target_elements_number); } AssertEquals(preset_layout.get_size(), postset_layout.get_size()); const std::size_t tmp_storage_bytes = preset_layout.get_size(); std::unique_ptr temp_storage( new std::uint8_t[tmp_storage_bytes]); preset_layout.map_to_buffer(temp_storage.get(), tmp_storage_bytes); postset_layout.map_to_buffer(temp_storage.get(), tmp_storage_bytes); for (int slot_id = 0; slot_id < StorageSlots; slot_id++) { AssertEquals(postset_arrays[slot_id]->get(), preset_arrays[slot_id]->get()); } } template void TestDoubleGrow() { using target_type = std::uint64_t; constexpr std::size_t target_elements_number = 42; cub::detail::temporary_storage::layout preset_layout; std::unique_ptr> preset_arrays[StorageSlots]; for (int slot_id = 0; slot_id < StorageSlots; slot_id++) { preset_arrays[slot_id].reset( new cub::detail::temporary_storage::alias( 
preset_layout.get_slot(slot_id)->template create_alias( 2 * target_elements_number))); } cub::detail::temporary_storage::layout postset_layout; std::unique_ptr> postset_arrays[StorageSlots]; for (int slot_id = 0; slot_id < StorageSlots; slot_id++) { postset_arrays[slot_id].reset( new cub::detail::temporary_storage::alias( postset_layout.get_slot(slot_id)->template create_alias( target_elements_number))); postset_arrays[slot_id]->grow(2 * target_elements_number); } AssertEquals(preset_layout.get_size(), postset_layout.get_size()); const std::size_t tmp_storage_bytes = preset_layout.get_size(); std::unique_ptr temp_storage( new std::uint8_t[tmp_storage_bytes]); preset_layout.map_to_buffer(temp_storage.get(), tmp_storage_bytes); postset_layout.map_to_buffer(temp_storage.get(), tmp_storage_bytes); for (int slot_id = 0; slot_id < StorageSlots; slot_id++) { AssertEquals(postset_arrays[slot_id]->get(), preset_arrays[slot_id]->get()); } } template void Test() { TestEmptyStorage(); TestPartiallyFilledStorage(); TestGrow(); TestDoubleGrow(); } int main() { Test<1>(); Test<4>(); Test<42>(); } cub-2.0.1/test/test_thread_operators.cu000066400000000000000000000204501434614775400202170ustar00rootroot00000000000000/******************************************************************************* * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include "test_util.h" #include template T Make(int val) { return T{val}; } template class BaseT { protected: int m_val{}; public: BaseT(int val) : m_val{val} {} }; template <> class BaseT { protected: int m_val{}; public: BaseT(int val) : m_val{val} {} __host__ __device__ operator int() const { return m_val; } }; #define CUSTOM_TYPE_FACTORY(NAME, RT, OP, CONVERTABLE) \ class Custom##NAME##T : public BaseT \ { \ explicit Custom##NAME##T(int val) \ : BaseT(val) \ {} \ \ friend Custom##NAME##T Make(int); \ \ public: \ __host__ __device__ RT operator OP(int val) const \ { \ return m_val OP val; \ } \ } // NAME RT OP CONVERTABLE CUSTOM_TYPE_FACTORY(Eq, bool, ==, false); CUSTOM_TYPE_FACTORY(Ineq, bool, !=, false); CUSTOM_TYPE_FACTORY(Sum, int, +, false); CUSTOM_TYPE_FACTORY(Diff, int, -, false); CUSTOM_TYPE_FACTORY(Div, int, /, false); CUSTOM_TYPE_FACTORY(Gt, bool, >, true); CUSTOM_TYPE_FACTORY(Lt, bool, <, true); void TestEquality() { cub::Equality op{}; const int const_magic_val = 42; int magic_val = const_magic_val; AssertEquals(op(const_magic_val, const_magic_val), true); AssertEquals(op(const_magic_val, magic_val), true); AssertEquals(op(const_magic_val, magic_val + 1), false); AssertEquals(op(Make(magic_val), magic_val), true); AssertEquals(op(Make(magic_val), magic_val + 1), false); } void TestInequality() { cub::Inequality op{}; const int const_magic_val = 42; int magic_val = const_magic_val; AssertEquals(op(const_magic_val, const_magic_val), false); AssertEquals(op(const_magic_val, magic_val), false); AssertEquals(op(const_magic_val, magic_val + 1), true); AssertEquals(op(Make(magic_val), magic_val), false); AssertEquals(op(Make(magic_val), magic_val + 1), true); } void TestInequalityWrapper() { cub::Equality wrapped_op{}; cub::InequalityWrapper op{wrapped_op}; const int const_magic_val = 42; int magic_val = const_magic_val; AssertEquals(op(const_magic_val, const_magic_val), false); AssertEquals(op(const_magic_val, magic_val), false); AssertEquals(op(const_magic_val, magic_val + 1), true); AssertEquals(op(Make(magic_val), magic_val), false); AssertEquals(op(Make(magic_val), magic_val + 1), true); } #define CUSTOM_SYNC_T(NAME, RT, OP) \ struct Custom ## NAME ## Sink \ { \ template \ __host__ __device__ RT operator OP (T &&) const \ { \ return RT{}; \ } \ } CUSTOM_SYNC_T(SumInt, int, +); CUSTOM_SYNC_T(SumCustomInt, CustomSumIntSink, +); CUSTOM_SYNC_T(DiffInt, int, -); CUSTOM_SYNC_T(DiffCustomInt, CustomDiffIntSink, -); CUSTOM_SYNC_T(DivInt, int, /); CUSTOM_SYNC_T(DivCustomInt, CustomDivIntSink, /); template void StaticSame() { static_assert(std::is_same::value, "shall match"); } void TestSum() { cub::Sum op{}; const int const_magic_val = 40; int magic_val = const_magic_val; AssertEquals(op(const_magic_val, 2), 42); AssertEquals(op(magic_val, 2), 42); AssertEquals(op(Make(magic_val), 2), 42); StaticSame(); StaticSame(); StaticSame(); StaticSame(); } void TestDifference() { cub::Difference op{}; const int const_magic_val = 44; int magic_val = const_magic_val; AssertEquals(op(const_magic_val, 2), 42); AssertEquals(op(magic_val, 2), 42); AssertEquals(op(Make(magic_val), 2), 42); StaticSame(); StaticSame(); StaticSame(); StaticSame(); } void TestDivision() { cub::Division op{}; const int const_magic_val = 44; int magic_val = const_magic_val; AssertEquals(op(const_magic_val, 2), 22); AssertEquals(op(magic_val, 2), 22); AssertEquals(op(Make(magic_val), 2), 22); StaticSame(); StaticSame(); StaticSame(); 
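  // (Comment added for clarity) The StaticSame checks in this function assert,
  // via std::is_same, that cub::Division reports the same result type as the
  // underlying operator/ for the operand types under test, mirroring the
  // equivalent checks in TestSum and TestDifference above.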
StaticSame(); } void TestMax() { cub::Max op{}; const int const_magic_val = 42; int magic_val = const_magic_val; AssertEquals(op(const_magic_val, 2), 42); AssertEquals(op(magic_val, 2), 42); AssertEquals(op(2, Make(magic_val)), 42); StaticSame(); StaticSame(); StaticSame(magic_val))), int>(); } void TestMin() { cub::Min op{}; const int const_magic_val = 42; int magic_val = const_magic_val; AssertEquals(op(const_magic_val, 2), 2); AssertEquals(op(magic_val, 2), 2); AssertEquals(op(2, Make(magic_val)), 2); StaticSame(); StaticSame(); StaticSame(magic_val))), int>(); } int main() { TestEquality(); TestInequality(); TestInequalityWrapper(); TestSum(); TestDifference(); TestDivision(); TestMax(); TestMin(); return 0; } cub-2.0.1/test/test_thread_sort.cu000066400000000000000000000117101434614775400171670ustar00rootroot00000000000000/******************************************************************************* * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include "test_util.h" #include "cub/thread/thread_sort.cuh" #include #include #include #include #include #include struct CustomLess { template __host__ __device__ bool operator()(DataType &lhs, DataType &rhs) { return lhs < rhs; } }; template __global__ void kernel(const KeyT *keys_in, KeyT *keys_out, const ValueT *values_in, ValueT *values_out) { KeyT thread_keys[ItemsPerThread]; KeyT thread_values[ItemsPerThread]; const auto thread_offset = ItemsPerThread * threadIdx.x; keys_in += thread_offset; keys_out += thread_offset; values_in += thread_offset; values_out += thread_offset; for (int item = 0; item < ItemsPerThread; item++) { thread_keys[item] = keys_in[item]; thread_values[item] = values_in[item]; } cub::StableOddEvenSort(thread_keys, thread_values, CustomLess{}); for (int item = 0; item < ItemsPerThread; item++) { keys_out[item] = thread_keys[item]; values_out[item] = thread_values[item]; } } template void Test() { const unsigned int threads_in_block = 1024; const unsigned int elements = threads_in_block * ItemsPerThread; thrust::default_random_engine re; thrust::device_vector data_source(elements); for (int iteration = 0; iteration < 10; iteration++) { thrust::sequence(data_source.begin(), data_source.end()); thrust::shuffle(data_source.begin(), data_source.end(), re); thrust::device_vector in_keys(data_source); thrust::device_vector out_keys(elements); thrust::shuffle(data_source.begin(), data_source.end(), re); thrust::device_vector in_values(data_source); thrust::device_vector out_values(elements); thrust::host_vector host_keys(in_keys); thrust::host_vector host_values(in_values); kernel<<<1, threads_in_block>>>( thrust::raw_pointer_cast(in_keys.data()), thrust::raw_pointer_cast(out_keys.data()), thrust::raw_pointer_cast(in_values.data()), thrust::raw_pointer_cast(out_values.data())); for (unsigned int tid = 0; tid < threads_in_block; tid++) { const auto thread_begin = tid * ItemsPerThread; const auto thread_end = thread_begin + ItemsPerThread; thrust::sort_by_key(host_keys.begin() + thread_begin, host_keys.begin() + thread_end, host_values.begin() + thread_begin, CustomLess{}); } AssertEquals(host_keys, out_keys); AssertEquals(host_values, out_values); } } template void Test() { Test(); Test(); Test(); Test(); Test(); Test(); Test(); Test(); } int main() { Test(); Test(); return 0; } cub-2.0.1/test/test_util.h000066400000000000000000001716571434614775400154670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #if defined(_WIN32) || defined(_WIN64) #include #undef small // Windows is terrible for polluting macro namespace #else #include #endif #include #include #include #include #include #include #include #include #include #include "mersenne.h" #include "half.h" #include "bfloat16.h" #include #include #include #include #include #include #include #include #include /****************************************************************************** * Type conversion macros ******************************************************************************/ /** * Return a value of type `T` with the same bitwise representation of `in`. * Types `T` and `U` must be the same size. */ template T SafeBitCast(const U& in) { static_assert(sizeof(T) == sizeof(U), "Types must be same size."); T out; memcpy(&out, &in, sizeof(T)); return out; } /****************************************************************************** * Assertion macros ******************************************************************************/ /** * Assert equals */ #define AssertEquals(a, b) \ if ((a) != (b)) \ { \ std::cerr << "\n" \ << __FILE__ << ": " << __LINE__ \ << ": AssertEquals(" #a ", " #b ") failed.\n"; \ exit(1); \ } #define AssertTrue(a) \ if (!(a)) \ { \ std::cerr << "\n" \ << __FILE__ << ": " << __LINE__ \ << ": AssertTrue(" #a ") failed.\n"; \ exit(1); \ } /****************************************************************************** * Command-line parsing functionality ******************************************************************************/ /** * Utility for parsing command line arguments */ struct CommandLineArgs { std::vector keys; std::vector values; std::vector args; cudaDeviceProp deviceProp; float device_giga_bandwidth; std::size_t device_free_physmem; std::size_t device_total_physmem; /** * Constructor */ CommandLineArgs(int argc, char **argv) : keys(10), values(10) { using namespace std; // Initialize mersenne generator unsigned int mersenne_init[4]= {0x123, 0x234, 0x345, 0x456}; mersenne::init_by_array(mersenne_init, 4); for (int i = 1; i < argc; i++) { string arg = argv[i]; if ((arg[0] != '-') || (arg[1] != '-')) { args.push_back(arg); continue; } string::size_type pos; string key, val; if ((pos = arg.find('=')) == string::npos) { key = string(arg, 2, arg.length() - 2); val = ""; } else { key = string(arg, 2, pos - 2); val = string(arg, pos + 1, arg.length() - 1); } keys.push_back(key); values.push_back(val); } } /** * Checks whether a flag "--" is present in the commandline */ bool CheckCmdLineFlag(const char* arg_name) { using namespace std; for (std::size_t i = 0; i < keys.size(); ++i) { if (keys[i] == string(arg_name)) return true; } return false; } /** * Returns 
number of naked (non-flag and non-key-value) commandline parameters */ template int NumNakedArgs() { return args.size(); } /** * Returns the commandline parameter for a given index (not including flags) */ template void GetCmdLineArgument(std::size_t index, T &val) { using namespace std; if (index < args.size()) { istringstream str_stream(args[index]); str_stream >> val; } } /** * Returns the value specified for a given commandline parameter --= */ template void GetCmdLineArgument(const char *arg_name, T &val) { using namespace std; for (std::size_t i = 0; i < keys.size(); ++i) { if (keys[i] == string(arg_name)) { istringstream str_stream(values[i]); str_stream >> val; } } } /** * Returns the values specified for a given commandline parameter --=,* */ template void GetCmdLineArguments(const char *arg_name, std::vector &vals) { using namespace std; if (CheckCmdLineFlag(arg_name)) { // Clear any default values vals.clear(); // Recover from multi-value string for (std::size_t i = 0; i < keys.size(); ++i) { if (keys[i] == string(arg_name)) { string val_string(values[i]); istringstream str_stream(val_string); string::size_type old_pos = 0; string::size_type new_pos = 0; // Iterate comma-separated values T val; while ((new_pos = val_string.find(',', old_pos)) != string::npos) { if (new_pos != old_pos) { str_stream.width(new_pos - old_pos); str_stream >> val; vals.push_back(val); } // skip over comma str_stream.ignore(1); old_pos = new_pos + 1; } // Read last value str_stream >> val; vals.push_back(val); } } } } /** * The number of pairs parsed */ int ParsedArgc() { return (int) keys.size(); } /** * Initialize device */ cudaError_t DeviceInit(int dev = -1) { cudaError_t error = cudaSuccess; do { int deviceCount; error = CubDebug(cudaGetDeviceCount(&deviceCount)); if (error) break; if (deviceCount == 0) { fprintf(stderr, "No devices supporting CUDA.\n"); exit(1); } if (dev < 0) { GetCmdLineArgument("device", dev); } if ((dev > deviceCount - 1) || (dev < 0)) { dev = 0; } error = CubDebug(cudaSetDevice(dev)); if (error) break; CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem)); int ptx_version = 0; error = CubDebug(CUB_NS_QUALIFIER::PtxVersion(ptx_version)); if (error) break; error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev)); if (error) break; if (deviceProp.major < 1) { fprintf(stderr, "Device does not support CUDA.\n"); exit(1); } device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000; if (!CheckCmdLineFlag("quiet")) { printf( "Using device %d: %s (PTX version %d, SM%d, %d SMs, " "%lld free / %lld total MB physmem, " "%.3f GB/s @ %d kHz mem clock, ECC %s)\n", dev, deviceProp.name, ptx_version, deviceProp.major * 100 + deviceProp.minor * 10, deviceProp.multiProcessorCount, (unsigned long long) device_free_physmem / 1024 / 1024, (unsigned long long) device_total_physmem / 1024 / 1024, device_giga_bandwidth, deviceProp.memoryClockRate, (deviceProp.ECCEnabled) ? "on" : "off"); fflush(stdout); } } while (0); return error; } }; // Gets the amount of global memory of the current device. 
std::size_t TotalGlobalMem()
{
  int device = 0;
  CubDebugExit(cudaGetDevice(&device));

  std::size_t free_mem = 0, total_mem = 0;
  CubDebugExit(cudaMemGetInfo(&free_mem, &total_mem));

  return total_mem;
}

/******************************************************************************
 * Random bits generator
 ******************************************************************************/

int g_num_rand_samples = 0;

template <typename T>
bool IsNaN(T /* val */) { return false; }

template <>
__noinline__ bool IsNaN(float val) { return std::isnan(val); }

template <>
__noinline__ bool IsNaN(float1 val) { return (IsNaN(val.x)); }

template <>
__noinline__ bool IsNaN(float2 val) { return (IsNaN(val.y) || IsNaN(val.x)); }

template <>
__noinline__ bool IsNaN(float3 val)
{ return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x)); }

template <>
__noinline__ bool IsNaN(float4 val)
{ return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z)); }

template <>
__noinline__ bool IsNaN(double val) { return std::isnan(val); }

template <>
__noinline__ bool IsNaN(double1 val) { return (IsNaN(val.x)); }

template <>
__noinline__ bool IsNaN(double2 val) { return (IsNaN(val.y) || IsNaN(val.x)); }

template <>
__noinline__ bool IsNaN(double3 val)
{ return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x)); }

template <>
__noinline__ bool IsNaN(double4 val)
{ return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z)); }

template <>
__noinline__ bool IsNaN(half_t val)
{
  const auto bits = SafeBitCast<unsigned short>(val);

  // commented bit is always true, leaving for documentation:
  return (((bits >= 0x7C01) && (bits <= 0x7FFF)) ||
          ((bits >= 0xFC01) /*&& (bits <= 0xFFFFFFFF)*/));
}

template <>
__noinline__ bool IsNaN(bfloat16_t val)
{
  const auto bits = SafeBitCast<unsigned short>(val);

  // commented bit is always true, leaving for documentation:
  return (((bits >= 0x7F81) && (bits <= 0x7FFF)) ||
          ((bits >= 0xFF81) /*&& (bits <= 0xFFFFFFFF)*/));
}

/**
 * Generates random keys.
 *
 * We always take the second-order byte from rand() because the higher-order
 * bits returned by rand() are commonly considered more uniformly distributed
 * than the lower-order bits.
 *
 * We can decrease the entropy level of keys by adopting the technique
 * of Thearling and Smith in which keys are computed from the bitwise AND of
 * multiple random samples:
 *
 *      entropy_reduction    |  Effectively-unique bits per key
 *      -----------------------------------------------------
 *      -1                   |  0
 *      0                    |  32
 *      1                    |  25.95 (81%)
 *      2                    |  17.41 (54%)
 *      3                    |  10.78 (34%)
 *      4                    |  6.42 (20%)
 *      ...                  |  ...
* */ template void RandomBits( K &key, int entropy_reduction = 0, int begin_bit = 0, int end_bit = sizeof(K) * 8) { const int NUM_BYTES = sizeof(K); const int WORD_BYTES = sizeof(unsigned int); const int NUM_WORDS = (NUM_BYTES + WORD_BYTES - 1) / WORD_BYTES; unsigned int word_buff[NUM_WORDS]; if (entropy_reduction == -1) { memset((void *) &key, 0, sizeof(key)); return; } if (end_bit < 0) end_bit = sizeof(K) * 8; while (true) { // Generate random word_buff for (int j = 0; j < NUM_WORDS; j++) { int current_bit = j * WORD_BYTES * 8; unsigned int word = 0xffffffff; word &= 0xffffffff << CUB_MAX(0, begin_bit - current_bit); word &= 0xffffffff >> CUB_MAX(0, (current_bit + (WORD_BYTES * 8)) - end_bit); for (int i = 0; i <= entropy_reduction; i++) { // Grab some of the higher bits from rand (better entropy, supposedly) word &= mersenne::genrand_int32(); g_num_rand_samples++; } word_buff[j] = word; } memcpy(&key, word_buff, sizeof(K)); K copy = key; if (!IsNaN(copy)) break; // avoids NaNs when generating random floating point numbers } } /// Randomly select number between [0:max) template T RandomValue(T max) { unsigned int bits; unsigned int max_int = (unsigned int) -1; do { RandomBits(bits); } while (bits == max_int); return (T) ((double(bits) / double(max_int)) * double(max)); } /****************************************************************************** * Console printing utilities ******************************************************************************/ /** * Helper for casting character types to integers for cout printing */ template T CoutCast(T val) { return val; } int CoutCast(char val) { return val; } int CoutCast(unsigned char val) { return val; } int CoutCast(signed char val) { return val; } /****************************************************************************** * Test value initialization utilities ******************************************************************************/ /** * Test problem generation options */ enum GenMode { UNIFORM, // Assign to '2', regardless of integer seed INTEGER_SEED, // Assign to integer seed RANDOM, // Assign to random, regardless of integer seed RANDOM_BIT, // Assign to randomly chosen 0 or 1, regardless of integer seed RANDOM_MINUS_PLUS_ZERO, // Assign to random, with some values being -0.0 or +0.0 patterns }; /** * Initialize value */ #pragma nv_exec_check_disable template __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) { // RandomBits is host-only. NV_IF_TARGET( NV_IS_HOST, ( switch (gen_mode) { case RANDOM: RandomBits(value); break; case RANDOM_BIT: { char c; RandomBits(c, 0, 0, 1); value = static_cast((c > 0) ? 
1 : -1); break; } case RANDOM_MINUS_PLUS_ZERO: { // Replace roughly 1/128 of values with -0.0 or +0.0, and // generate the rest randomly using UnsignedBits = typename CUB_NS_QUALIFIER::Traits::UnsignedBits; char c; RandomBits(c); if (c == 0) { // Replace 1/256 of values with +0.0 bit pattern value = SafeBitCast(UnsignedBits(0)); } else if (c == 1) { // Replace 1/256 of values with -0.0 bit pattern value = SafeBitCast( UnsignedBits(UnsignedBits(1) << (sizeof(UnsignedBits) * 8) - 1)); } else { // 127/128 of values are random RandomBits(value); } break; } case UNIFORM: value = 2; break; case INTEGER_SEED: default: value = static_cast(index); break; }), ( // NV_IS_DEVICE: switch (gen_mode) { case RANDOM: case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); CUB_NS_QUALIFIER::ThreadTrap(); break; case UNIFORM: value = 2; break; case INTEGER_SEED: default: value = static_cast(index); break; } )); } /** * Initialize value (bool) */ #pragma nv_exec_check_disable __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, std::size_t index = 0) { // RandomBits is host-only. NV_IF_TARGET( NV_IS_HOST, ( switch (gen_mode) { case RANDOM: case RANDOM_BIT: char c; RandomBits(c, 0, 0, 1); value = (c > 0); break; case UNIFORM: value = true; break; case INTEGER_SEED: default: value = (index > 0); break; } ), ( // NV_IS_DEVICE, switch (gen_mode) { case RANDOM: case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); CUB_NS_QUALIFIER::ThreadTrap(); break; case UNIFORM: value = true; break; case INTEGER_SEED: default: value = (index > 0); break; } )); } /** * cub::NullType test initialization */ __host__ __device__ __forceinline__ void InitValue(GenMode /* gen_mode */, CUB_NS_QUALIFIER::NullType &/* value */, std::size_t /* index */ = 0) {} /** * cub::KeyValuePairtest initialization */ #pragma nv_exec_check_disable template __host__ __device__ __forceinline__ void InitValue( GenMode gen_mode, CUB_NS_QUALIFIER::KeyValuePair& value, std::size_t index = 0) { InitValue(gen_mode, value.value, index); // This specialization only appears to be used by test_warp_scan. // It initializes with uniform values and random keys, so we need to // protect the call to the host-only RandomBits. 
// clang-format off NV_IF_TARGET(NV_IS_HOST, ( // Assign corresponding flag with a likelihood of the last bit // being set with entropy-reduction level 3 RandomBits(value.key, 3); value.key = (value.key & 0x1); ), ( // NV_IS_DEVICE _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); CUB_NS_QUALIFIER::ThreadTrap(); )); // clang-format on } /****************************************************************************** * Comparison and ostream operators ******************************************************************************/ /** * KeyValuePair ostream operator */ template std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair &val) { os << '(' << CoutCast(val.key) << ',' << CoutCast(val.value) << ')'; return os; } /****************************************************************************** * Comparison and ostream operators for CUDA vector types ******************************************************************************/ /** * Vector1 overloads */ #define CUB_VEC_OVERLOAD_1(T, BaseT) \ /* Ostream output */ \ std::ostream& operator<<( \ std::ostream& os, \ const T& val) \ { \ os << '(' << CoutCast(val.x) << ')'; \ return os; \ } \ /* Inequality */ \ __host__ __device__ __forceinline__ bool operator!=( \ const T &a, \ const T &b) \ { \ return (a.x != b.x); \ } \ /* Equality */ \ __host__ __device__ __forceinline__ bool operator==( \ const T &a, \ const T &b) \ { \ return (a.x == b.x); \ } \ /* Test initialization */ \ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) \ { \ InitValue(gen_mode, value.x, index); \ } \ /* Max */ \ __host__ __device__ __forceinline__ bool operator>( \ const T &a, \ const T &b) \ { \ return (a.x > b.x); \ } \ /* Min */ \ __host__ __device__ __forceinline__ bool operator<( \ const T &a, \ const T &b) \ { \ return (a.x < b.x); \ } \ /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ __host__ __device__ __forceinline__ T operator+( \ T a, \ T b) \ { \ T retval = make_##T(a.x + b.x); \ return retval; \ } \ CUB_NAMESPACE_BEGIN \ template<> \ struct NumericTraits \ { \ static const Category CATEGORY = NOT_A_NUMBER; \ enum { \ PRIMITIVE = false, \ NULL_TYPE = false, \ }; \ static __host__ __device__ T Max() \ { \ T retval = { \ NumericTraits::Max()}; \ return retval; \ } \ static __host__ __device__ T Lowest() \ { \ T retval = { \ NumericTraits::Lowest()}; \ return retval; \ } \ }; \ CUB_NAMESPACE_END /** * Vector2 overloads */ #define CUB_VEC_OVERLOAD_2(T, BaseT) \ /* Ostream output */ \ std::ostream& operator<<( \ std::ostream& os, \ const T& val) \ { \ os << '(' \ << CoutCast(val.x) << ',' \ << CoutCast(val.y) << ')'; \ return os; \ } \ /* Inequality */ \ __host__ __device__ __forceinline__ bool operator!=( \ const T &a, \ const T &b) \ { \ return (a.x != b.x) || \ (a.y != b.y); \ } \ /* Equality */ \ __host__ __device__ __forceinline__ bool operator==( \ const T &a, \ const T &b) \ { \ return (a.x == b.x) && \ (a.y == b.y); \ } \ /* Test initialization */ \ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) \ { \ InitValue(gen_mode, value.x, index); \ InitValue(gen_mode, value.y, index); \ } \ /* Max */ \ __host__ __device__ __forceinline__ bool operator>( \ const T &a, \ const T &b) \ { \ if (a.x > b.x) return true; else if (b.x > a.x) return false; \ return a.y > b.y; \ } \ /* Min */ \ __host__ __device__ __forceinline__ bool operator<( \ const T &a, \ const T &b) \ { \ if (a.x < 
b.x) return true; else if (b.x < a.x) return false; \ return a.y < b.y; \ } \ /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ __host__ __device__ __forceinline__ T operator+( \ T a, \ T b) \ { \ T retval = make_##T( \ a.x + b.x, \ a.y + b.y); \ return retval; \ } \ CUB_NAMESPACE_BEGIN \ template<> \ struct NumericTraits \ { \ static const Category CATEGORY = NOT_A_NUMBER; \ enum { \ PRIMITIVE = false, \ NULL_TYPE = false, \ }; \ static __host__ __device__ T Max() \ { \ T retval = { \ NumericTraits::Max(), \ NumericTraits::Max()}; \ return retval; \ } \ static __host__ __device__ T Lowest() \ { \ T retval = { \ NumericTraits::Lowest(), \ NumericTraits::Lowest()}; \ return retval; \ } \ }; \ CUB_NAMESPACE_END /** * Vector3 overloads */ #define CUB_VEC_OVERLOAD_3(T, BaseT) \ /* Ostream output */ \ std::ostream& operator<<( \ std::ostream& os, \ const T& val) \ { \ os << '(' \ << CoutCast(val.x) << ',' \ << CoutCast(val.y) << ',' \ << CoutCast(val.z) << ')'; \ return os; \ } \ /* Inequality */ \ __host__ __device__ __forceinline__ bool operator!=( \ const T &a, \ const T &b) \ { \ return (a.x != b.x) || \ (a.y != b.y) || \ (a.z != b.z); \ } \ /* Equality */ \ __host__ __device__ __forceinline__ bool operator==( \ const T &a, \ const T &b) \ { \ return (a.x == b.x) && \ (a.y == b.y) && \ (a.z == b.z); \ } \ /* Test initialization */ \ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) \ { \ InitValue(gen_mode, value.x, index); \ InitValue(gen_mode, value.y, index); \ InitValue(gen_mode, value.z, index); \ } \ /* Max */ \ __host__ __device__ __forceinline__ bool operator>( \ const T &a, \ const T &b) \ { \ if (a.x > b.x) return true; else if (b.x > a.x) return false; \ if (a.y > b.y) return true; else if (b.y > a.y) return false; \ return a.z > b.z; \ } \ /* Min */ \ __host__ __device__ __forceinline__ bool operator<( \ const T &a, \ const T &b) \ { \ if (a.x < b.x) return true; else if (b.x < a.x) return false; \ if (a.y < b.y) return true; else if (b.y < a.y) return false; \ return a.z < b.z; \ } \ /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ __host__ __device__ __forceinline__ T operator+( \ T a, \ T b) \ { \ T retval = make_##T( \ a.x + b.x, \ a.y + b.y, \ a.z + b.z); \ return retval; \ } \ CUB_NAMESPACE_BEGIN \ template<> \ struct NumericTraits \ { \ static const Category CATEGORY = NOT_A_NUMBER; \ enum { \ PRIMITIVE = false, \ NULL_TYPE = false, \ }; \ static __host__ __device__ T Max() \ { \ T retval = { \ NumericTraits::Max(), \ NumericTraits::Max(), \ NumericTraits::Max()}; \ return retval; \ } \ static __host__ __device__ T Lowest() \ { \ T retval = { \ NumericTraits::Lowest(), \ NumericTraits::Lowest(), \ NumericTraits::Lowest()}; \ return retval; \ } \ }; \ CUB_NAMESPACE_END /** * Vector4 overloads */ #define CUB_VEC_OVERLOAD_4(T, BaseT) \ /* Ostream output */ \ std::ostream& operator<<( \ std::ostream& os, \ const T& val) \ { \ os << '(' \ << CoutCast(val.x) << ',' \ << CoutCast(val.y) << ',' \ << CoutCast(val.z) << ',' \ << CoutCast(val.w) << ')'; \ return os; \ } \ /* Inequality */ \ __host__ __device__ __forceinline__ bool operator!=( \ const T &a, \ const T &b) \ { \ return (a.x != b.x) || \ (a.y != b.y) || \ (a.z != b.z) || \ (a.w != b.w); \ } \ /* Equality */ \ __host__ __device__ __forceinline__ bool operator==( \ const T &a, \ const T &b) \ { \ return (a.x == b.x) && \ (a.y == b.y) && \ (a.z == b.z) && \ (a.w == b.w); \ } \ /* Test initialization */ \ __host__ 
__device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) \ { \ InitValue(gen_mode, value.x, index); \ InitValue(gen_mode, value.y, index); \ InitValue(gen_mode, value.z, index); \ InitValue(gen_mode, value.w, index); \ } \ /* Max */ \ __host__ __device__ __forceinline__ bool operator>( \ const T &a, \ const T &b) \ { \ if (a.x > b.x) return true; else if (b.x > a.x) return false; \ if (a.y > b.y) return true; else if (b.y > a.y) return false; \ if (a.z > b.z) return true; else if (b.z > a.z) return false; \ return a.w > b.w; \ } \ /* Min */ \ __host__ __device__ __forceinline__ bool operator<( \ const T &a, \ const T &b) \ { \ if (a.x < b.x) return true; else if (b.x < a.x) return false; \ if (a.y < b.y) return true; else if (b.y < a.y) return false; \ if (a.z < b.z) return true; else if (b.z < a.z) return false; \ return a.w < b.w; \ } \ /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ __host__ __device__ __forceinline__ T operator+( \ T a, \ T b) \ { \ T retval = make_##T( \ a.x + b.x, \ a.y + b.y, \ a.z + b.z, \ a.w + b.w); \ return retval; \ } \ CUB_NAMESPACE_BEGIN \ template<> \ struct NumericTraits \ { \ static const Category CATEGORY = NOT_A_NUMBER; \ enum { \ PRIMITIVE = false, \ NULL_TYPE = false, \ }; \ static __host__ __device__ T Max() \ { \ T retval = { \ NumericTraits::Max(), \ NumericTraits::Max(), \ NumericTraits::Max(), \ NumericTraits::Max()}; \ return retval; \ } \ static __host__ __device__ T Lowest() \ { \ T retval = { \ NumericTraits::Lowest(), \ NumericTraits::Lowest(), \ NumericTraits::Lowest(), \ NumericTraits::Lowest()}; \ return retval; \ } \ }; \ CUB_NAMESPACE_END /** * All vector overloads */ #define CUB_VEC_OVERLOAD(COMPONENT_T, BaseT) \ CUB_VEC_OVERLOAD_1(COMPONENT_T##1, BaseT) \ CUB_VEC_OVERLOAD_2(COMPONENT_T##2, BaseT) \ CUB_VEC_OVERLOAD_3(COMPONENT_T##3, BaseT) \ CUB_VEC_OVERLOAD_4(COMPONENT_T##4, BaseT) /** * Define for types */ CUB_VEC_OVERLOAD(char, signed char) CUB_VEC_OVERLOAD(short, short) CUB_VEC_OVERLOAD(int, int) CUB_VEC_OVERLOAD(long, long) CUB_VEC_OVERLOAD(longlong, long long) CUB_VEC_OVERLOAD(uchar, unsigned char) CUB_VEC_OVERLOAD(ushort, unsigned short) CUB_VEC_OVERLOAD(uint, unsigned int) CUB_VEC_OVERLOAD(ulong, unsigned long) CUB_VEC_OVERLOAD(ulonglong, unsigned long long) CUB_VEC_OVERLOAD(float, float) CUB_VEC_OVERLOAD(double, double) //--------------------------------------------------------------------- // Complex data type TestFoo //--------------------------------------------------------------------- /** * TestFoo complex data type */ struct TestFoo { using x_t = long long; using y_t = int; using z_t = short; using w_t = char; x_t x; y_t y; z_t z; w_t w; // Factory static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w) { TestFoo retval = {x, y, z, w}; return retval; } // Assignment from int operator __host__ __device__ __forceinline__ TestFoo& operator =(int b) { x = static_cast(b); y = static_cast(b); z = static_cast(b); w = static_cast(b); return *this; } // Summation operator __host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const { return MakeTestFoo(x + b.x, y + b.y, z + b.z, w + b.w); } // Inequality operator __host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const { return (x != b.x) || (y != b.y) || (z != b.z) || (w != b.w); } // Equality operator __host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const { return (x == b.x) && (y == b.y) && (z == b.z) && (w == 
b.w); } // Less than operator __host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const { if (x < b.x) return true; else if (b.x < x) return false; if (y < b.y) return true; else if (b.y < y) return false; if (z < b.z) return true; else if (b.z < z) return false; return w < b.w; } // Greater than operator __host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const { if (x > b.x) return true; else if (b.x > x) return false; if (y > b.y) return true; else if (b.y > y) return false; if (z > b.z) return true; else if (b.z > z) return false; return w > b.w; } }; /** * TestFoo ostream operator */ std::ostream& operator<<(std::ostream& os, const TestFoo& val) { os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')'; return os; } /** * TestFoo test initialization */ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, std::size_t index = 0) { InitValue(gen_mode, value.x, index); InitValue(gen_mode, value.y, index); InitValue(gen_mode, value.z, index); InitValue(gen_mode, value.w, index); } /// numeric_limits specialization CUB_NAMESPACE_BEGIN template<> struct NumericTraits { static const Category CATEGORY = NOT_A_NUMBER; enum { PRIMITIVE = false, NULL_TYPE = false, }; __host__ __device__ static TestFoo Max() { return TestFoo::MakeTestFoo( NumericTraits::Max(), NumericTraits::Max(), NumericTraits::Max(), NumericTraits::Max()); } __host__ __device__ static TestFoo Lowest() { return TestFoo::MakeTestFoo( NumericTraits::Lowest(), NumericTraits::Lowest(), NumericTraits::Lowest(), NumericTraits::Lowest()); } }; CUB_NAMESPACE_END //--------------------------------------------------------------------- // Complex data type TestBar (with optimizations for fence-free warp-synchrony) //--------------------------------------------------------------------- /** * TestBar complex data type */ struct TestBar { long long x; int y; // Constructor __host__ __device__ __forceinline__ TestBar() : x(0), y(0) {} // Constructor __host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b) {} // Constructor __host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y) {} // Assignment from int operator __host__ __device__ __forceinline__ TestBar& operator =(int b) { x = b; y = b; return *this; } // Summation operator __host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const { return TestBar(x + b.x, y + b.y); } // Inequality operator __host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const { return (x != b.x) || (y != b.y); } // Equality operator __host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const { return (x == b.x) && (y == b.y); } // Less than operator __host__ __device__ __forceinline__ bool operator <(const TestBar &b) const { if (x < b.x) return true; else if (b.x < x) return false; return y < b.y; } // Greater than operator __host__ __device__ __forceinline__ bool operator >(const TestBar &b) const { if (x > b.x) return true; else if (b.x > x) return false; return y > b.y; } }; /** * TestBar ostream operator */ std::ostream& operator<<(std::ostream& os, const TestBar& val) { os << '(' << val.x << ',' << val.y << ')'; return os; } /** * TestBar test initialization */ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, std::size_t index = 0) { InitValue(gen_mode, value.x, index); InitValue(gen_mode, value.y, index); } /// numeric_limits specialization CUB_NAMESPACE_BEGIN template<> struct NumericTraits { static 
const Category CATEGORY = NOT_A_NUMBER; enum { PRIMITIVE = false, NULL_TYPE = false, }; __host__ __device__ static TestBar Max() { return TestBar( NumericTraits::Max(), NumericTraits::Max()); } __host__ __device__ static TestBar Lowest() { return TestBar( NumericTraits::Lowest(), NumericTraits::Lowest()); } }; CUB_NAMESPACE_END /****************************************************************************** * Helper routines for list comparison and display ******************************************************************************/ /** * Compares the equivalence of two arrays */ template int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true) { for (OffsetT i = 0; i < len; i++) { if (computed[i] != reference[i]) { if (verbose) std::cout << "INCORRECT: [" << i << "]: " << CoutCast(computed[i]) << " != " << CoutCast(reference[i]); return 1; } } return 0; } /** * Compares the equivalence of two arrays */ template int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true) { for (OffsetT i = 0; i < len; i++) { if (computed[i] != reference[i]) { float difference = std::abs(computed[i]-reference[i]); float fraction = difference / std::abs(reference[i]); if (fraction > 0.00015) { if (verbose) std::cout << "INCORRECT: [" << i << "]: " << "(computed) " << CoutCast(computed[i]) << " != " << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")"; return 1; } } } return 0; } /** * Compares the equivalence of two arrays */ template int CompareResults(CUB_NS_QUALIFIER::NullType* computed, CUB_NS_QUALIFIER::NullType* reference, OffsetT len, bool verbose = true) { return 0; } /** * Compares the equivalence of two arrays */ template int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true) { for (OffsetT i = 0; i < len; i++) { if (computed[i] != reference[i]) { double difference = std::abs(computed[i]-reference[i]); double fraction = difference / std::abs(reference[i]); if (fraction > 0.00015) { if (verbose) std::cout << "INCORRECT: [" << i << "]: " << CoutCast(computed[i]) << " != " << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")"; return 1; } } } return 0; } /** * Verify the contents of a device array match those * of a host array */ int CompareDeviceResults( CUB_NS_QUALIFIER::NullType */* h_reference */, CUB_NS_QUALIFIER::NullType */* d_data */, std::size_t /* num_items */, bool /* verbose */ = true, bool /* display_data */ = false) { return 0; } /** * Verify the contents of a device array match those * of a host array */ template int CompareDeviceResults( S */*h_reference*/, CUB_NS_QUALIFIER::DiscardOutputIterator /*d_data*/, std::size_t /*num_items*/, bool /*verbose*/ = true, bool /*display_data*/ = false) { return 0; } /** * Verify the contents of a device array match those * of a host array */ template int CompareDeviceResults( S *h_reference, T *d_data, std::size_t num_items, bool verbose = true, bool display_data = false) { if (num_items == 0) { return 0; } // Allocate array on host T *h_data = (T*) malloc(num_items * sizeof(T)); // Copy data back cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); // Display data if (display_data) { printf("Reference:\n"); for (std::size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_reference[i]) << ", "; } printf("\n\nComputed:\n"); for (std::size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_data[i]) << ", "; } printf("\n\n"); } // Check int retval = 
CompareResults(h_data, h_reference, num_items, verbose); // Cleanup if (h_data) free(h_data); return retval; } /** * Verify the contents of a device array match those * of a device array */ template int CompareDeviceDeviceResults( T *d_reference, T *d_data, std::size_t num_items, bool verbose = true, bool display_data = false) { // Allocate array on host T *h_reference = (T*) malloc(num_items * sizeof(T)); T *h_data = (T*) malloc(num_items * sizeof(T)); // Copy data back cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost); cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); // Display data if (display_data) { printf("Reference:\n"); for (std::size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_reference[i]) << ", "; } printf("\n\nComputed:\n"); for (std::size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_data[i]) << ", "; } printf("\n\n"); } // Check int retval = CompareResults(h_data, h_reference, num_items, verbose); // Cleanup if (h_reference) free(h_reference); if (h_data) free(h_data); return retval; } /** * Print the contents of a host array */ void DisplayResults( CUB_NS_QUALIFIER::NullType */* h_data */, std::size_t /* num_items */) {} /** * Print the contents of a host array */ template void DisplayResults( InputIteratorT h_data, std::size_t num_items) { // Display data for (std::size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_data[i]) << ", "; } printf("\n"); } /** * Print the contents of a device array */ template void DisplayDeviceResults( T *d_data, std::size_t num_items) { // Allocate array on host T *h_data = (T*) malloc(num_items * sizeof(T)); // Copy data back cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); DisplayResults(h_data, num_items); // Cleanup if (h_data) free(h_data); } /****************************************************************************** * Segment descriptor generation ******************************************************************************/ /** * Initialize segments */ template void InitializeSegments( OffsetT num_items, int num_segments, OffsetT *h_segment_offsets, bool verbose = false) { if (num_segments <= 0) return; OffsetT expected_segment_length = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, OffsetT(num_segments)); OffsetT offset = 0; for (int i = 0; i < num_segments; ++i) { h_segment_offsets[i] = offset; OffsetT segment_length = RandomValue((expected_segment_length * 2) + 1); offset += segment_length; offset = CUB_MIN(offset, num_items); } h_segment_offsets[num_segments] = num_items; if (verbose) { printf("Segment offsets: "); DisplayResults(h_segment_offsets, num_segments + 1); } } /****************************************************************************** * Timing ******************************************************************************/ struct CpuTimer { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER ll_freq; LARGE_INTEGER ll_start; LARGE_INTEGER ll_stop; CpuTimer() { QueryPerformanceFrequency(&ll_freq); } void Start() { QueryPerformanceCounter(&ll_start); } void Stop() { QueryPerformanceCounter(&ll_stop); } float ElapsedMillis() { double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart); double stop = double(ll_stop.QuadPart) / double(ll_freq.QuadPart); return float((stop - start) * 1000); } #else rusage start; rusage stop; void Start() { getrusage(RUSAGE_SELF, &start); } void Stop() { getrusage(RUSAGE_SELF, &stop); } float ElapsedMillis() { float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec; 
float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec; return (sec * 1000) + (usec / 1000); } #endif }; struct GpuTimer { cudaEvent_t start; cudaEvent_t stop; GpuTimer() { cudaEventCreate(&start); cudaEventCreate(&stop); } ~GpuTimer() { cudaEventDestroy(start); cudaEventDestroy(stop); } void Start() { cudaEventRecord(start, 0); } void Stop() { cudaEventRecord(stop, 0); } float ElapsedMillis() { float elapsed; cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsed, start, stop); return elapsed; } }; struct HugeDataType { static constexpr int ELEMENTS_PER_OBJECT = 128; __device__ __host__ HugeDataType() { for (int i = 0; i < ELEMENTS_PER_OBJECT; i++) { data[i] = 0; } } __device__ __host__ HugeDataType(const HugeDataType&rhs) { for (int i = 0; i < ELEMENTS_PER_OBJECT; i++) { data[i] = rhs.data[i]; } } explicit __device__ __host__ HugeDataType(int val) { for (int i = 0; i < ELEMENTS_PER_OBJECT; i++) { data[i] = val; } } int data[ELEMENTS_PER_OBJECT]; }; inline __device__ __host__ bool operator==(const HugeDataType &lhs, const HugeDataType &rhs) { for (int i = 0; i < HugeDataType::ELEMENTS_PER_OBJECT; i++) { if (lhs.data[i] != rhs.data[i]) { return false; } } return true; } inline __device__ __host__ bool operator<(const HugeDataType &lhs, const HugeDataType &rhs) { for (int i = 0; i < HugeDataType::ELEMENTS_PER_OBJECT; i++) { if (lhs.data[i] < rhs.data[i]) { return true; } } return false; } template __device__ __host__ bool operator!=(const HugeDataType &lhs, const DataType &rhs) { for (int i = 0; i < HugeDataType::ELEMENTS_PER_OBJECT; i++) { if (lhs.data[i] != rhs) { return true; } } return false; } template void FillStriped(IteratorT it) { using T = CUB_NS_QUALIFIER::detail::value_t; const int warps_in_block = BlockThreads / LogicalWarpThreads; const int items_per_warp = LogicalWarpThreads * ItemsPerThread; for (int warp_id = 0; warp_id < warps_in_block; warp_id++) { const T warp_offset_val = static_cast(items_per_warp * warp_id); for (int lane_id = 0; lane_id < LogicalWarpThreads; lane_id++) { const T lane_offset = warp_offset_val + static_cast(lane_id); for (int item = 0; item < ItemsPerThread; item++) { *(it++) = lane_offset + static_cast(item * LogicalWarpThreads); } } } } cub-2.0.1/test/test_warp_exchange.cu000066400000000000000000000330311434614775400174640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include "test_util.h" #include "cub/warp/warp_exchange.cuh" #include #include #include #include template __global__ void kernel(const InputT *input_data, OutputT *output_data, ActionT action, cub::Int2Type /* same_type */) { using WarpExchangeT = cub::WarpExchange; constexpr int tile_size = ItemsPerThread * LogicalWarpThreads; constexpr int warps_per_block = BlockThreads / LogicalWarpThreads; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; const int warp_id = threadIdx.x / LogicalWarpThreads; const int lane_id = threadIdx.x % LogicalWarpThreads; WarpExchangeT exchange(temp_storage[warp_id]); InputT input[ItemsPerThread]; input_data += warp_id * tile_size; output_data += warp_id * tile_size; for (int item = 0; item < ItemsPerThread; item++) { input[item] = input_data[lane_id * ItemsPerThread + item]; } action(input, input, exchange); for (int item = 0; item < ItemsPerThread; item++) { output_data[lane_id * ItemsPerThread + item] = input[item]; } } template __global__ void kernel(const InputT *input_data, OutputT *output_data, ActionT action, cub::Int2Type /* different_types */) { using WarpExchangeT = cub::WarpExchange; constexpr int tile_size = ItemsPerThread * LogicalWarpThreads; constexpr int warps_per_block = BlockThreads / LogicalWarpThreads; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; const int warp_id = threadIdx.x / LogicalWarpThreads; const int lane_id = threadIdx.x % LogicalWarpThreads; WarpExchangeT exchange(temp_storage[warp_id]); InputT input[ItemsPerThread]; OutputT output[ItemsPerThread]; input_data += warp_id * tile_size; output_data += warp_id * tile_size; for (int item = 0; item < ItemsPerThread; item++) { input[item] = input_data[lane_id * ItemsPerThread + item]; } action(input, output, exchange); for (int item = 0; item < ItemsPerThread; item++) { output_data[lane_id * ItemsPerThread + item] = output[item]; } } struct StripedToBlocked { template __device__ void operator()( InputT (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], cub::WarpExchange &exchange) { exchange.StripedToBlocked(input, output); } }; struct BlockedToStriped { template __device__ void operator()( InputT (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], cub::WarpExchange &exchange) { exchange.BlockedToStriped(input, output); } }; template bool Compare( const thrust::device_vector &lhs, const thrust::device_vector &rhs) { auto err = thrust::mismatch(lhs.begin(), lhs.end(), rhs.begin()); if (err.first != lhs.end()) { auto i = thrust::distance(lhs.begin(), err.first); std::cerr << "Mismatch at " << i << ": " << lhs[i] << " != " << rhs[i] << std::endl; return false; } return true; } template void TestStripedToBlocked(thrust::device_vector &input, thrust::device_vector &output) { thrust::fill(output.begin(), output.end(), OutputT{0}); thrust::host_vector h_input(input.size()); FillStriped( h_input.begin()); input = h_input; kernel 
<<<1, BlockThreads>>>(thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), StripedToBlocked{}, cub::Int2Type::value>{}); cudaDeviceSynchronize(); thrust::device_vector expected_output(output.size()); thrust::sequence(expected_output.begin(), expected_output.end()); AssertTrue(Compare(expected_output, output)); } template void TestBlockedToStriped(thrust::device_vector &input, thrust::device_vector &output) { thrust::fill(output.begin(), output.end(), OutputT{0}); thrust::host_vector expected_output(input.size()); FillStriped( expected_output.begin()); thrust::sequence(input.begin(), input.end()); kernel <<<1, BlockThreads>>>(thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), BlockedToStriped{}, cub::Int2Type::value>{}); cudaDeviceSynchronize(); thrust::device_vector d_expected_output(expected_output); AssertTrue(Compare(d_expected_output, output)); } template __global__ void scatter_kernel(const InputT *input_data, OutputT *output_data, cub::Int2Type /* same_type */) { using WarpExchangeT = cub::WarpExchange; constexpr int tile_size = ItemsPerThread * LogicalWarpThreads; constexpr int warps_per_block = BlockThreads / LogicalWarpThreads; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; const int warp_id = threadIdx.x / LogicalWarpThreads; const int lane_id = threadIdx.x % LogicalWarpThreads; WarpExchangeT exchange(temp_storage[warp_id]); InputT input[ItemsPerThread]; // Reverse data int ranks[ItemsPerThread]; input_data += warp_id * tile_size; output_data += warp_id * tile_size; for (int item = 0; item < ItemsPerThread; item++) { const auto item_idx = lane_id * ItemsPerThread + item; input[item] = input_data[item_idx]; ranks[item] = tile_size - 1 - item_idx; } exchange.ScatterToStriped(input, ranks); // Striped to blocked for (int item = 0; item < ItemsPerThread; item++) { output_data[item * LogicalWarpThreads + lane_id] = input[item]; } } template __global__ void scatter_kernel(const InputT *input_data, OutputT *output_data, cub::Int2Type /* different_types */) { using WarpExchangeT = cub::WarpExchange; constexpr int tile_size = ItemsPerThread * LogicalWarpThreads; constexpr int warps_per_block = BlockThreads / LogicalWarpThreads; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; const int warp_id = threadIdx.x / LogicalWarpThreads; const int lane_id = threadIdx.x % LogicalWarpThreads; WarpExchangeT exchange(temp_storage[warp_id]); InputT input[ItemsPerThread]; OutputT output[ItemsPerThread]; // Reverse data int ranks[ItemsPerThread]; input_data += warp_id * tile_size; output_data += warp_id * tile_size; for (int item = 0; item < ItemsPerThread; item++) { const auto item_idx = lane_id * ItemsPerThread + item; input[item] = input_data[item_idx]; ranks[item] = tile_size - 1 - item_idx; } exchange.ScatterToStriped(input, output, ranks); // Striped to blocked for (int item = 0; item < ItemsPerThread; item++) { output_data[item * LogicalWarpThreads + lane_id] = output[item]; } } template void TestScatterToStriped(thrust::device_vector &input, thrust::device_vector &output) { thrust::fill(output.begin(), output.end(), OutputT{0}); thrust::sequence(input.begin(), input.end()); scatter_kernel <<<1, BlockThreads>>>(thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()), cub::Int2Type::value>{}); thrust::device_vector d_expected_output(input); constexpr int tile_size = LogicalWarpThreads * ItemsPerThread; for (int warp_id = 0; warp_id < BlockThreads / 
LogicalWarpThreads; warp_id++) { const int warp_data_begin = tile_size * warp_id; const int warp_data_end = warp_data_begin + tile_size; thrust::reverse(d_expected_output.begin() + warp_data_begin, d_expected_output.begin() + warp_data_end); } AssertTrue(Compare(d_expected_output, output)); } template void Test() { static_assert(BlockThreads % LogicalWarpThreads == 0, "BlockThreads must be a multiple of LogicalWarpThreads"); const int warps_in_block = BlockThreads / LogicalWarpThreads; const int items_per_warp = LogicalWarpThreads * ItemsPerThread; const int items_per_block = items_per_warp * warps_in_block; thrust::device_vector input(items_per_block); thrust::device_vector output(items_per_block); TestStripedToBlocked(input, output); TestBlockedToStriped(input, output); TestScatterToStriped(input, output); } template void Test() { Test(); Test(); Test(); } template void Test() { Test(); Test(); } template void Test() { Test(); Test(); Test(); } int main(int argc, char** argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); Test<4>(); Test<16>(); Test<32>(); return 0; } cub-2.0.1/test/test_warp_load.cu000066400000000000000000000276251434614775400166350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; const int MAX_ITERATIONS = 30; template __global__ void kernel(InputIteratorT input, int *err) { using InputT = cub::detail::value_t; using WarpLoadT = WarpLoad; constexpr int warps_in_block = BlockThreads / WarpThreads; constexpr int tile_size = ItemsPerThread * WarpThreads; const int warp_id = static_cast(threadIdx.x) / WarpThreads; __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; InputT reg[ItemsPerThread]; WarpLoadT(temp_storage[warp_id]).Load(input + warp_id * tile_size, reg); for (int item = 0; item < ItemsPerThread; item++) { const auto expected_value = static_cast(threadIdx.x * ItemsPerThread + item); if (reg[item] != expected_value) { printf("TID: %u; WID: %d; LID: %d: ITEM: %d/%d: %d != %d\n", threadIdx.x, warp_id, static_cast(threadIdx.x) % WarpThreads, item, ItemsPerThread, static_cast(reg[item]), static_cast(expected_value)); atomicAdd(err, 1); break; } } } template __global__ void kernel(int valid_items, InputIteratorT input, int *err) { using InputT = cub::detail::value_t; using WarpLoadT = WarpLoad; constexpr int warps_in_block = BlockThreads / WarpThreads; constexpr int tile_size = ItemsPerThread * WarpThreads; const int tid = static_cast(threadIdx.x); const int warp_id = tid / WarpThreads; const int lane_id = tid % WarpThreads; __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; InputT reg[ItemsPerThread]; const auto oob_default = static_cast(valid_items); WarpLoadT(temp_storage[warp_id]) .Load(input + warp_id * tile_size, reg, valid_items, oob_default); for (int item = 0; item < ItemsPerThread; item++) { const auto expected_value = static_cast(tid * ItemsPerThread + item); const bool is_oob = LoadAlgorithm == WarpLoadAlgorithm::WARP_LOAD_STRIPED ? 
item * WarpThreads + lane_id >= valid_items : lane_id * ItemsPerThread + item >= valid_items; if (is_oob) { if (reg[item] != oob_default) { atomicAdd(err, 1); } } else if (reg[item] != expected_value) { atomicAdd(err, 1); } } } template void TestImplementation(InputIteratorT input) { thrust::device_vector err(1, 0); kernel <<<1, BlockThreads>>>(input, thrust::raw_pointer_cast(err.data())); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); const int errors_number = err[0]; const int expected_errors_number = 0; AssertEquals(errors_number, expected_errors_number); } template void TestImplementation(int valid_items, InputIteratorT input) { thrust::device_vector err(1, 0); kernel <<<1, BlockThreads>>>(valid_items, input, thrust::raw_pointer_cast(err.data())); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); const int errors_number = err[0]; const int expected_errors_number = 0; AssertEquals(errors_number, expected_errors_number); } template thrust::device_vector GenInput() { const int tile_size = WarpThreads * ItemsPerThread; const int total_warps = BlockThreads / WarpThreads; const int elements = total_warps * tile_size; thrust::device_vector input(elements); if (LoadAlgorithm == WarpLoadAlgorithm::WARP_LOAD_STRIPED) { thrust::host_vector h_input(elements); // In this case we need different stripe pattern, so the // items/threads parameters are swapped constexpr int fake_block_size = ItemsPerThread * (BlockThreads / WarpThreads); FillStriped(h_input.begin()); input = h_input; } else { thrust::sequence(input.begin(), input.end()); } return input; } template void TestPointer() { thrust::device_vector input = GenInput(); TestImplementation( thrust::raw_pointer_cast(input.data())); const unsigned int max_valid_items = WarpThreads * ItemsPerThread; for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { const int valid_items = static_cast(RandomValue(max_valid_items)); TestImplementation( valid_items, thrust::raw_pointer_cast(input.data())); } } template void TestIterator() { thrust::device_vector input = GenInput(); TestImplementation( CacheModifiedInputIterator( thrust::raw_pointer_cast(input.data()))); const int max_valid_items = WarpThreads * ItemsPerThread; for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { const int valid_items = RandomValue(max_valid_items); TestImplementation( valid_items, CacheModifiedInputIterator( thrust::raw_pointer_cast(input.data()))); } } template void TestIterator() { TestIterator(); TestIterator(); TestIterator(); TestIterator(); TestIterator(); TestIterator(); TestIterator(); } template void Test() { TestPointer(); TestIterator(); } template void Test() { Test(); Test(); Test(); Test(); } template void Test() { Test(); Test(); Test(); } template void Test() { Test(); Test(); Test(); } template void Test() { Test(); Test(); Test(); } int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); Test<256>(); } cub-2.0.1/test/test_warp_mask.cu000066400000000000000000000100371434614775400166360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include "test_util.h"

bool IsLaneInvolved(unsigned int member_mask, unsigned int lane)
{
  return member_mask & (1 << lane);
}

template <unsigned int LOGICAL_WARP_THREADS>
void Test()
{
  constexpr bool is_pow_of_two = cub::PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE;
  constexpr unsigned int warp_threads = 32;
  constexpr unsigned int warps = warp_threads / LOGICAL_WARP_THREADS;

  for (unsigned int warp_id = 0; warp_id < warps; warp_id++)
  {
    const unsigned int warp_mask  = cub::WarpMask<LOGICAL_WARP_THREADS>(warp_id);
    const unsigned int warp_begin = LOGICAL_WARP_THREADS * warp_id;
    const unsigned int warp_end   = warp_begin + LOGICAL_WARP_THREADS;

    if (is_pow_of_two)
    {
      for (unsigned int prev_warp_lane = 0;
           prev_warp_lane < warp_begin;
           prev_warp_lane++)
      {
        AssertEquals(IsLaneInvolved(warp_mask, prev_warp_lane), false);
      }

      for (unsigned int warp_lane = warp_begin;
           warp_lane < warp_end;
           warp_lane++)
      {
        AssertEquals(IsLaneInvolved(warp_mask, warp_lane), true);
      }

      for (unsigned int post_warp_lane = warp_end;
           post_warp_lane < warp_threads;
           post_warp_lane++)
      {
        AssertEquals(IsLaneInvolved(warp_mask, post_warp_lane), false);
      }
    }
    else
    {
      for (unsigned int warp_lane = 0;
           warp_lane < LOGICAL_WARP_THREADS;
           warp_lane++)
      {
        AssertEquals(IsLaneInvolved(warp_mask, warp_lane), true);
      }

      for (unsigned int warp_lane = LOGICAL_WARP_THREADS;
           warp_lane < warp_threads;
           warp_lane++)
      {
        AssertEquals(IsLaneInvolved(warp_mask, warp_lane), false);
      }
    }
  }
}

void TestPowersOfTwo()
{
  Test<1>(); Test<2>(); Test<4>(); Test<8>(); Test<16>(); Test<32>();
}

void TestNonPowersOfTwo()
{
  Test<3>();  Test<5>();  Test<6>();  Test<7>();  Test<9>();  Test<10>();
  Test<11>(); Test<12>(); Test<13>(); Test<14>(); Test<15>(); Test<17>();
  Test<18>(); Test<19>(); Test<20>(); Test<21>(); Test<22>(); Test<23>();
  Test<24>(); Test<25>(); Test<26>(); Test<27>(); Test<28>(); Test<29>();
  Test<30>(); Test<31>();
}

int main()
{
  TestPowersOfTwo();
  TestNonPowersOfTwo();
}
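
// -----------------------------------------------------------------------------
// Illustrative sketch (added for clarity; not part of the original test suite):
// shows how the member mask computed by cub::WarpMask is typically consumed on
// the device, e.g. as the mask argument of __shfl_sync() when a 32-thread
// hardware warp is subdivided into power-of-two logical warps. The kernel name
// and its launch configuration are hypothetical; nothing below is exercised by
// main() above.
// -----------------------------------------------------------------------------
template <unsigned int LOGICAL_WARP_THREADS>
__global__ void ExampleWarpMaskUsage(int *out)
{
  static_assert(cub::PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
                "__shfl_sync() requires a power-of-two segment width");

  // Assumes a single 32-thread block, i.e. exactly one hardware warp.
  const unsigned int lane    = cub::LaneId();
  const unsigned int warp_id = lane / LOGICAL_WARP_THREADS;

  // Member mask covering exactly the lanes of this logical warp.
  const unsigned int member_mask = cub::WarpMask<LOGICAL_WARP_THREADS>(warp_id);

  // Broadcast lane 0 of each logical warp to all of its members.
  out[lane] = __shfl_sync(member_mask,
                          static_cast<int>(lane),
                          0,
                          LOGICAL_WARP_THREADS);
}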
cub-2.0.1/test/test_warp_merge_sort.cu000066400000000000000000000467251434614775400200660ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of WarpMergeSort utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; struct CustomType { std::uint8_t key; std::uint64_t count; __device__ __host__ CustomType() : key(0) , count(0) {} __device__ __host__ CustomType(std::uint64_t value) : key(static_cast(value)) , count(value) {} __device__ __host__ void operator=(std::uint64_t value) { key = static_cast(value); count = value; } }; struct CustomLess { template __device__ bool operator()(DataType &lhs, DataType &rhs) { return lhs < rhs; } __device__ bool operator()(CustomType &lhs, CustomType &rhs) { return lhs.key < rhs.key; } }; template < typename DataType, unsigned int ThreadsInBlock, unsigned int ThreadsInWarp, unsigned int ItemsPerThread, bool Stable = false> __global__ void WarpMergeSortTestKernel(unsigned int valid_segments, DataType *data, const unsigned int *segment_sizes) { using WarpMergeSortT = cub::WarpMergeSort; constexpr unsigned int WarpsInBlock = ThreadsInBlock / ThreadsInWarp; const unsigned int segment_id = threadIdx.x / ThreadsInWarp; if (segment_id >= valid_segments) { // Test case of partially finished CTA return; } __shared__ typename WarpMergeSortT::TempStorage temp_storage[WarpsInBlock]; WarpMergeSortT warp_sort(temp_storage[segment_id]); DataType thread_data[ItemsPerThread]; const unsigned int thread_offset = ThreadsInWarp * ItemsPerThread * segment_id + warp_sort.get_linear_tid() * ItemsPerThread; const 
unsigned int valid_items = segment_sizes[segment_id]; for (unsigned int item = 0; item < ItemsPerThread; item++) { const unsigned int idx = thread_offset + item; thread_data[item] = item < valid_items ? data[idx] : DataType(); } WARP_SYNC(warp_sort.get_member_mask()); // Tests below use sequence to fill the data. // Therefore the following value should be greater than any that // is present in the input data. const DataType oob_default = static_cast(ThreadsInBlock * ItemsPerThread + 1); if (Stable) { if (valid_items == ThreadsInBlock * ItemsPerThread) { warp_sort.StableSort( thread_data, CustomLess()); } else { warp_sort.StableSort( thread_data, CustomLess(), valid_items, oob_default); } } else { if (valid_items == ThreadsInBlock * ItemsPerThread) { warp_sort.Sort( thread_data, CustomLess()); } else { warp_sort.Sort( thread_data, CustomLess(), valid_items, oob_default); } } for (unsigned int item = 0; item < ItemsPerThread; item++) { const unsigned int idx = thread_offset + item; if (item >= valid_items) break; data[idx] = thread_data[item]; } } template< typename DataType, unsigned int ThreadsInBlock, unsigned int ThreadsInWarp, unsigned int ItemsPerThread, bool Stable> void WarpMergeSortTest( unsigned int valid_segments, DataType *data, const unsigned int *segment_sizes) { WarpMergeSortTestKernel <<<1, ThreadsInBlock>>>(valid_segments, data, segment_sizes); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); } template __global__ void WarpMergeSortTestKernel(unsigned int valid_segments, KeyType *keys, ValueType *values, const unsigned int *segment_sizes) { using WarpMergeSortT = cub::WarpMergeSort; constexpr unsigned int WarpsInBlock = ThreadsInBlock / ThreadsInWarp; const unsigned int segment_id = threadIdx.x / ThreadsInWarp; if (segment_id >= valid_segments) { // Test case of partially finished CTA return; } __shared__ typename WarpMergeSortT::TempStorage temp_storage[WarpsInBlock]; WarpMergeSortT warp_sort(temp_storage[segment_id]); KeyType thread_keys[ItemsPerThread]; ValueType thread_values[ItemsPerThread]; const unsigned int thread_offset = ThreadsInWarp * ItemsPerThread * segment_id + warp_sort.get_linear_tid() * ItemsPerThread; const unsigned int valid_items = segment_sizes[segment_id]; for (unsigned int item = 0; item < ItemsPerThread; item++) { const unsigned int idx = thread_offset + item; thread_keys[item] = item < valid_items ? keys[idx] : KeyType(); thread_values[item] = item < valid_items ? values[idx] : ValueType(); } WARP_SYNC(warp_sort.get_member_mask()); // Tests below use sequence to fill the data. // Therefore the following value should be greater than any that // is present in the input data. 
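  // This sentinel is passed as oob_default to the guarded Sort/StableSort
  // calls below: register slots beyond valid_items are treated as holding a
  // value larger than every real key, so they gather at the tail of the
  // sorted tile and only the leading valid_items elements are written back.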
const KeyType oob_default = static_cast(ThreadsInBlock * ItemsPerThread + 1); if (Stable) { if (valid_items == ThreadsInBlock * ItemsPerThread) { warp_sort.StableSort(thread_keys, thread_values, CustomLess()); } else { warp_sort.StableSort(thread_keys, thread_values, CustomLess(), valid_items, oob_default); } } else { if (valid_items == ThreadsInBlock * ItemsPerThread) { warp_sort.Sort(thread_keys, thread_values, CustomLess()); } else { warp_sort.Sort(thread_keys, thread_values, CustomLess(), valid_items, oob_default); } } for (unsigned int item = 0; item < ItemsPerThread; item++) { const unsigned int idx = thread_offset + item; if (item >= valid_items) break; keys[idx] = thread_keys[item]; values[idx] = thread_values[item]; } } template void WarpMergeSortTest(unsigned int valid_segments, KeyType *keys, ValueType *values, const unsigned int *segment_sizes) { WarpMergeSortTestKernel <<<1, ThreadsInBlock>>>(valid_segments, keys, values, segment_sizes); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); } template bool CheckResult(unsigned int valid_segments, thrust::device_vector &d_data, thrust::host_vector &h_data, const thrust::host_vector &segment_sizes) { thrust::copy(d_data.begin(), d_data.end(), h_data.begin()); constexpr unsigned int max_segment_size = ThreadsInWarp * ItemsPerThread; for (unsigned int segment_id = 0; segment_id < valid_segments; segment_id++) { unsigned int segment_size = segment_sizes[segment_id]; for (unsigned int i = 0; i < segment_size; i++) { const auto actual_value = h_data[max_segment_size * segment_id + i]; const auto expected_value = static_cast(i); if (actual_value != expected_value) { return false; } } } return true; } template < typename DataType, unsigned int ThreadsInBlock, unsigned int ThreadsInWarp, unsigned int ItemsPerThread, bool Stable> void Test(unsigned int valid_segments, thrust::default_random_engine &rng, thrust::device_vector &d_data, thrust::host_vector &h_data, const thrust::host_vector &h_segment_sizes, const thrust::device_vector &d_segment_sizes) { thrust::fill(d_data.begin(), d_data.end(), DataType{}); constexpr unsigned int max_segment_size = ThreadsInWarp * ItemsPerThread; for (unsigned int segment_id = 0; segment_id < valid_segments; segment_id++) { const unsigned int segment_offset = max_segment_size * segment_id; const unsigned int segment_size = h_segment_sizes[segment_id]; auto segment_begin = d_data.begin() + segment_offset; auto segment_end = segment_begin + segment_size; thrust::sequence(segment_begin, segment_end); thrust::shuffle(segment_begin, segment_end, rng); } WarpMergeSortTest(valid_segments, thrust::raw_pointer_cast(d_data.data()), thrust::raw_pointer_cast(d_segment_sizes.data())); const bool check = CheckResult(valid_segments, d_data, h_data, h_segment_sizes); AssertTrue(check); } template struct Cast { __device__ __host__ DestType operator()(const SourceType val) { return static_cast(val); } }; template < typename KeyType, typename ValueType, unsigned int ThreadsInBlock, unsigned int ThreadsInWarp, unsigned int ItemsPerThread, bool Stable> void Test(unsigned int valid_segments, thrust::default_random_engine &rng, thrust::device_vector &d_keys, thrust::host_vector &h_keys, thrust::device_vector &d_values, thrust::host_vector &h_values, const thrust::host_vector &h_segment_sizes, const thrust::device_vector &d_segment_sizes) { thrust::fill(d_keys.begin(), d_keys.end(), KeyType{}); thrust::fill(d_values.begin(), d_values.end(), ValueType{}); constexpr unsigned int max_segment_size = 
ThreadsInWarp * ItemsPerThread; for (unsigned int segment_id = 0; segment_id < valid_segments; segment_id++) { const unsigned int segment_offset = max_segment_size * segment_id; const unsigned int segment_size = h_segment_sizes[segment_id]; auto segment_begin = d_keys.begin() + segment_offset; auto segment_end = segment_begin + segment_size; thrust::sequence(segment_begin, segment_end); thrust::shuffle(segment_begin, segment_end, rng); thrust::transform(segment_begin, segment_end, d_values.begin() + segment_offset, Cast{}); } WarpMergeSortTest(valid_segments, thrust::raw_pointer_cast(d_keys.data()), thrust::raw_pointer_cast(d_values.data()), thrust::raw_pointer_cast(d_segment_sizes.data())); const bool keys_ok = CheckResult(valid_segments, d_keys, h_keys, h_segment_sizes); const bool values_ok = CheckResult(valid_segments, d_values, h_values, h_segment_sizes); AssertTrue(keys_ok); AssertTrue(values_ok); } template < typename KeyType, typename ValueType, unsigned int ThreadsInBlock, unsigned int ThreadsInWarp, unsigned int ItemsPerThread, bool Stable> void Test(thrust::default_random_engine &rng) { constexpr unsigned int max_segments = ThreadsInBlock / ThreadsInWarp; constexpr unsigned int max_segment_size = ThreadsInWarp * ItemsPerThread; thrust::device_vector h_segment_sizes_set(max_segment_size); thrust::sequence(h_segment_sizes_set.begin(), h_segment_sizes_set.end()); thrust::device_vector h_segment_sizes; for (unsigned int segment_id = 0; segment_id < max_segments; segment_id++) { h_segment_sizes.insert(h_segment_sizes.end(), h_segment_sizes_set.begin(), h_segment_sizes_set.end()); } thrust::device_vector d_segment_sizes(h_segment_sizes); thrust::device_vector d_keys(max_segments * max_segment_size); thrust::device_vector d_values(max_segments * max_segment_size); thrust::host_vector h_keys(max_segments * max_segment_size); thrust::host_vector h_values(max_segments * max_segment_size); for (unsigned int valid_segments = 1; valid_segments < max_segments; valid_segments += 3) { thrust::shuffle(h_segment_sizes.begin(), h_segment_sizes.end(), rng); thrust::copy(h_segment_sizes.begin(), h_segment_sizes.end(), d_segment_sizes.begin()); Test( valid_segments, rng, d_keys, h_keys, h_segment_sizes, d_segment_sizes); Test( valid_segments, rng, d_keys, h_keys, d_values, h_values, h_segment_sizes, d_segment_sizes); } } template void Test(thrust::default_random_engine &rng) { Test(rng); Test(rng); // Mixed types Test(rng); Test(rng); } template < unsigned int ThreadsInWarp, unsigned int ItemsPerThread, bool Stable> void Test(thrust::default_random_engine &rng) { Test<32, ThreadsInWarp, ItemsPerThread, Stable>(rng); Test<64, ThreadsInWarp, ItemsPerThread, Stable>(rng); } template void Test(thrust::default_random_engine &rng) { Test<1, ItemsPerThread, Stable>(rng); Test<2, ItemsPerThread, Stable>(rng); Test<4, ItemsPerThread, Stable>(rng); Test<8, ItemsPerThread, Stable>(rng); Test<16, ItemsPerThread, Stable>(rng); Test<32, ItemsPerThread, Stable>(rng); } template void Test(thrust::default_random_engine &rng) { // Check TestStability for stable = true constexpr bool unstable = false; Test(rng); } struct CountToType { __device__ __host__ CustomType operator()(std::uint64_t val) { return { val }; } }; struct CountComparator { __device__ __host__ bool operator()(const CustomType &lhs, const CustomType &rhs) { if (lhs.key == rhs.key) return lhs.count < rhs.count; return lhs.key < rhs.key; } }; template void TestStability() { constexpr unsigned int items_per_thread = 10; constexpr unsigned int 
threads_per_block = 128; constexpr unsigned int valid_segments = 1; constexpr unsigned int elements = items_per_thread * ThreadsInWarp; constexpr bool stable = true; thrust::device_vector d_keys(elements); thrust::device_vector d_counts(elements); thrust::sequence(d_counts.begin(), d_counts.end(), std::uint64_t{}, std::uint64_t{128}); thrust::transform(d_counts.begin(), d_counts.end(), d_keys.begin(), CountToType{}); thrust::device_vector d_segment_sizes(valid_segments, elements); // Sort keys WarpMergeSortTest(valid_segments, thrust::raw_pointer_cast(d_keys.data()), thrust::raw_pointer_cast(d_segment_sizes.data())); // Check counts AssertTrue(thrust::is_sorted(d_keys.begin(), d_keys.end(), CountComparator{})); } int main(int argc, char** argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); thrust::default_random_engine rng; Test<2>(rng); Test<7>(rng); TestStability<4>(); TestStability<32>(); return 0; } cub-2.0.1/test/test_warp_reduce.cu000066400000000000000000000646051434614775400171640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of WarpReduce utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; CachingDeviceAllocator g_allocator(true); /** * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants) */ template< typename OpT, int LOGICAL_WARP_THREADS> struct WrapperFunctor { OpT op; int num_valid; inline __host__ __device__ WrapperFunctor(OpT op, int num_valid) : op(op), num_valid(num_valid) {} template inline __host__ __device__ T operator()(const T &a, const T &b) const { NV_IF_TARGET(NV_IS_DEVICE, ( if ((cub::LaneId() % LOGICAL_WARP_THREADS) >= num_valid) { _CubLog("%s\n", "Invalid lane ID in cub::WrapperFunctor::operator()"); cub::ThreadTrap(); } )); return static_cast(op(a, b)); } }; //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /** * Generic reduction */ template < typename T, typename ReductionOp, typename WarpReduce, bool PRIMITIVE = Traits::PRIMITIVE> struct DeviceTest { static __device__ __forceinline__ T Reduce( typename WarpReduce::TempStorage &temp_storage, T &data, ReductionOp &reduction_op) { return WarpReduce(temp_storage).Reduce(data, reduction_op); } static __device__ __forceinline__ T Reduce( typename WarpReduce::TempStorage &temp_storage, T &data, ReductionOp &reduction_op, const int &valid_warp_threads) { return WarpReduce(temp_storage).Reduce(data, reduction_op, valid_warp_threads); } template static __device__ __forceinline__ T HeadSegmentedReduce( typename WarpReduce::TempStorage &temp_storage, T &data, FlagT &flag, ReductionOp &reduction_op) { return WarpReduce(temp_storage).HeadSegmentedReduce(data, flag, reduction_op); } template static __device__ __forceinline__ T TailSegmentedReduce( typename WarpReduce::TempStorage &temp_storage, T &data, FlagT &flag, ReductionOp &reduction_op) { return WarpReduce(temp_storage).TailSegmentedReduce(data, flag, reduction_op); } }; /** * Summation */ template < typename T, typename WarpReduce> struct DeviceTest { static __device__ __forceinline__ T Reduce( typename WarpReduce::TempStorage &temp_storage, T &data, Sum &reduction_op) { return WarpReduce(temp_storage).Sum(data); } static __device__ __forceinline__ T Reduce( typename WarpReduce::TempStorage &temp_storage, T &data, Sum &reduction_op, const int &valid_warp_threads) { return WarpReduce(temp_storage).Sum(data, valid_warp_threads); } template static __device__ __forceinline__ T HeadSegmentedReduce( typename WarpReduce::TempStorage &temp_storage, T &data, FlagT &flag, Sum &reduction_op) { return WarpReduce(temp_storage).HeadSegmentedSum(data, flag); } template static __device__ __forceinline__ T TailSegmentedReduce( typename WarpReduce::TempStorage &temp_storage, T &data, FlagT &flag, Sum &reduction_op) { return WarpReduce(temp_storage).TailSegmentedSum(data, flag); } }; /** * Full-tile warp reduction kernel */ template < int WARPS, int LOGICAL_WARP_THREADS, typename T, typename ReductionOp> __global__ 
void FullWarpReduceKernel( T *d_in, T *d_out, ReductionOp reduction_op, clock_t *d_elapsed) { // Cooperative warp-reduce utility type (1 warp) typedef WarpReduce WarpReduce; // Allocate temp storage in shared memory __shared__ typename WarpReduce::TempStorage temp_storage[WARPS]; // Per-thread tile data T input = d_in[threadIdx.x]; // Record elapsed clocks __threadfence_block(); // workaround to prevent clock hoisting clock_t start = clock(); __threadfence_block(); // workaround to prevent clock hoisting // Test warp reduce int warp_id = threadIdx.x / LOGICAL_WARP_THREADS; T output = DeviceTest::Reduce( temp_storage[warp_id], input, reduction_op); // Record elapsed clocks __threadfence_block(); // workaround to prevent clock hoisting clock_t stop = clock(); __threadfence_block(); // workaround to prevent clock hoisting *d_elapsed = stop - start; // Store aggregate d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ? output : input; } /** * Partially-full warp reduction kernel */ template < int WARPS, int LOGICAL_WARP_THREADS, typename T, typename ReductionOp> __global__ void PartialWarpReduceKernel( T *d_in, T *d_out, ReductionOp reduction_op, clock_t *d_elapsed, int valid_warp_threads) { // Cooperative warp-reduce utility type typedef WarpReduce WarpReduce; // Allocate temp storage in shared memory __shared__ typename WarpReduce::TempStorage temp_storage[WARPS]; // Per-thread tile data T input = d_in[threadIdx.x]; // Record elapsed clocks __threadfence_block(); // workaround to prevent clock hoisting clock_t start = clock(); __threadfence_block(); // workaround to prevent clock hoisting // Test partial-warp reduce int warp_id = threadIdx.x / LOGICAL_WARP_THREADS; T output = DeviceTest::Reduce( temp_storage[warp_id], input, reduction_op, valid_warp_threads); // Record elapsed clocks __threadfence_block(); // workaround to prevent clock hoisting clock_t stop = clock(); __threadfence_block(); // workaround to prevent clock hoisting *d_elapsed = stop - start; // Store aggregate d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ? output : input; } /** * Head-based segmented warp reduction test kernel */ template < int WARPS, int LOGICAL_WARP_THREADS, typename T, typename FlagT, typename ReductionOp> __global__ void WarpHeadSegmentedReduceKernel( T *d_in, FlagT *d_head_flags, T *d_out, ReductionOp reduction_op, clock_t *d_elapsed) { // Cooperative warp-reduce utility type typedef WarpReduce WarpReduce; // Allocate temp storage in shared memory __shared__ typename WarpReduce::TempStorage temp_storage[WARPS]; // Per-thread tile data T input = d_in[threadIdx.x]; FlagT head_flag = d_head_flags[threadIdx.x]; // Record elapsed clocks __threadfence_block(); // workaround to prevent clock hoisting clock_t start = clock(); __threadfence_block(); // workaround to prevent clock hoisting // Test segmented warp reduce int warp_id = threadIdx.x / LOGICAL_WARP_THREADS; T output = DeviceTest::HeadSegmentedReduce( temp_storage[warp_id], input, head_flag, reduction_op); // Record elapsed clocks __threadfence_block(); // workaround to prevent clock hoisting clock_t stop = clock(); __threadfence_block(); // workaround to prevent clock hoisting *d_elapsed = stop - start; // Store aggregate d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ? 
output : input; } /** * Tail-based segmented warp reduction test kernel */ template < int WARPS, int LOGICAL_WARP_THREADS, typename T, typename FlagT, typename ReductionOp> __global__ void WarpTailSegmentedReduceKernel( T *d_in, FlagT *d_tail_flags, T *d_out, ReductionOp reduction_op, clock_t *d_elapsed) { // Cooperative warp-reduce utility type typedef WarpReduce WarpReduce; // Allocate temp storage in shared memory __shared__ typename WarpReduce::TempStorage temp_storage[WARPS]; // Per-thread tile data T input = d_in[threadIdx.x]; FlagT tail_flag = d_tail_flags[threadIdx.x]; FlagT head_flag = (threadIdx.x == 0) ? 0 : d_tail_flags[threadIdx.x - 1]; // Record elapsed clocks __threadfence_block(); // workaround to prevent clock hoisting clock_t start = clock(); __threadfence_block(); // workaround to prevent clock hoisting // Test segmented warp reduce int warp_id = threadIdx.x / LOGICAL_WARP_THREADS; T output = DeviceTest::TailSegmentedReduce( temp_storage[warp_id], input, tail_flag, reduction_op); // Record elapsed clocks __threadfence_block(); // workaround to prevent clock hoisting clock_t stop = clock(); __threadfence_block(); // workaround to prevent clock hoisting *d_elapsed = stop - start; // Store aggregate d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ? output : input; } //--------------------------------------------------------------------- // Host utility subroutines //--------------------------------------------------------------------- /** * Initialize reduction problem (and solution) */ template < typename T, typename ReductionOp> void Initialize( GenMode gen_mode, int flag_entropy, T *h_in, int *h_flags, int warps, int warp_threads, int valid_warp_threads, ReductionOp reduction_op, T *h_head_out, T *h_tail_out) { for (int i = 0; i < warps * warp_threads; ++i) { // Sample a value for this item InitValue(gen_mode, h_in[i], i); h_head_out[i] = h_in[i]; h_tail_out[i] = h_in[i]; // Sample whether or not this item will be a segment head char bits; RandomBits(bits, flag_entropy); h_flags[i] = bits & 0x1; } h_flags[warps * warp_threads] = {}; h_tail_out[warps * warp_threads] = {}; // Accumulate segments (lane 0 of each warp is implicitly a segment head) for (int warp = 0; warp < warps; ++warp) { int warp_offset = warp * warp_threads; int item_offset = warp_offset + valid_warp_threads - 1; // Last item in warp T head_aggregate = h_in[item_offset]; T tail_aggregate = h_in[item_offset]; if (h_flags[item_offset]) h_head_out[item_offset] = head_aggregate; item_offset--; // Work backwards while (item_offset >= warp_offset) { if (h_flags[item_offset + 1]) { head_aggregate = h_in[item_offset]; } else { head_aggregate = static_cast(reduction_op(head_aggregate, h_in[item_offset])); } if (h_flags[item_offset]) { h_head_out[item_offset] = head_aggregate; h_tail_out[item_offset + 1] = tail_aggregate; tail_aggregate = h_in[item_offset]; } else { tail_aggregate = static_cast(reduction_op(tail_aggregate, h_in[item_offset])); } item_offset--; } // Record last segment head_aggregate to head offset h_head_out[warp_offset] = head_aggregate; h_tail_out[warp_offset] = tail_aggregate; } } /** * Test warp reduction */ template < int WARPS, int LOGICAL_WARP_THREADS, typename T, typename ReductionOp> void TestReduce( GenMode gen_mode, ReductionOp reduction_op, int valid_warp_threads = LOGICAL_WARP_THREADS) { const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS; // Allocate host arrays T *h_in = new T[BLOCK_THREADS]; int *h_flags = new int[BLOCK_THREADS + 1]; T *h_out = new 
T[BLOCK_THREADS]; T *h_tail_out = new T[BLOCK_THREADS + 1]; // Initialize problem Initialize(gen_mode, -1, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, valid_warp_threads, reduction_op, h_out, h_tail_out); // Initialize/clear device arrays T *d_in = NULL; T *d_out = NULL; clock_t *d_elapsed = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * BLOCK_THREADS)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t))); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * BLOCK_THREADS)); if (g_verbose) { printf("Data:\n"); for (int i = 0; i < WARPS; ++i) DisplayResults(h_in + (i * LOGICAL_WARP_THREADS), valid_warp_threads); } // Run kernel printf("\nGen-mode %d, %d warps, %d warp threads, %d valid lanes, %s (%d bytes) elements:\n", gen_mode, WARPS, LOGICAL_WARP_THREADS, valid_warp_threads, typeid(T).name(), (int) sizeof(T)); fflush(stdout); if (valid_warp_threads == LOGICAL_WARP_THREADS) { // Run full-warp kernel FullWarpReduceKernel<<<1, BLOCK_THREADS>>>( d_in, d_out, reduction_op, d_elapsed); } else { // Run partial-warp kernel PartialWarpReduceKernel<<<1, BLOCK_THREADS>>>( d_in, d_out, reduction_op, d_elapsed, valid_warp_threads); } CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Copy out and display results printf("\tReduction results: "); int compare = CompareDeviceResults(h_out, d_out, BLOCK_THREADS, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); printf("\tElapsed clocks: "); DisplayDeviceResults(d_elapsed, 1); // Cleanup if (h_in) delete[] h_in; if (h_flags) delete[] h_flags; if (h_out) delete[] h_out; if (h_tail_out) delete[] h_tail_out; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); } /** * Test warp segmented reduction */ template < int WARPS, int LOGICAL_WARP_THREADS, typename T, typename ReductionOp> void TestSegmentedReduce( GenMode gen_mode, int flag_entropy, ReductionOp reduction_op) { const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS; // Allocate host arrays int compare; T *h_in = new T[BLOCK_THREADS]; int *h_flags = new int[BLOCK_THREADS + 1]; T *h_head_out = new T[BLOCK_THREADS + 1]; T *h_tail_out = new T[BLOCK_THREADS + 1]; // Initialize problem Initialize(gen_mode, flag_entropy, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, LOGICAL_WARP_THREADS, reduction_op, h_head_out, h_tail_out); // Initialize/clear device arrays T *d_in = NULL; int *d_flags = NULL; T *d_head_out = NULL; T *d_tail_out = NULL; clock_t *d_elapsed = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(int) * BLOCK_THREADS)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_head_out, sizeof(T) * BLOCK_THREADS)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_tail_out, sizeof(T) * BLOCK_THREADS)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t))); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_head_out, 0, sizeof(T) * BLOCK_THREADS)); CubDebugExit(cudaMemset(d_tail_out, 0, sizeof(T) * BLOCK_THREADS)); if 
(g_verbose) { printf("Data:\n"); for (int i = 0; i < WARPS; ++i) DisplayResults(h_in + (i * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS); printf("\nFlags:\n"); for (int i = 0; i < WARPS; ++i) DisplayResults(h_flags + (i * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS); } printf("\nGen-mode %d, head flag entropy reduction %d, %d warps, %d warp threads, %s (%d bytes) elements:\n", gen_mode, flag_entropy, WARPS, LOGICAL_WARP_THREADS, typeid(T).name(), (int) sizeof(T)); fflush(stdout); // Run head-based kernel WarpHeadSegmentedReduceKernel<<<1, BLOCK_THREADS>>>( d_in, d_flags, d_head_out, reduction_op, d_elapsed); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Copy out and display results printf("\tHead-based segmented reduction results: "); compare = CompareDeviceResults(h_head_out, d_head_out, BLOCK_THREADS, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); printf("\tElapsed clocks: "); DisplayDeviceResults(d_elapsed, 1); // Run tail-based kernel WarpTailSegmentedReduceKernel<<<1, BLOCK_THREADS>>>( d_in, d_flags, d_tail_out, reduction_op, d_elapsed); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Copy out and display results printf("\tTail-based segmented reduction results: "); compare = CompareDeviceResults(h_tail_out, d_tail_out, BLOCK_THREADS, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); printf("\tElapsed clocks: "); DisplayDeviceResults(d_elapsed, 1); // Cleanup if (h_in) delete[] h_in; if (h_flags) delete[] h_flags; if (h_head_out) delete[] h_head_out; if (h_tail_out) delete[] h_tail_out; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags)); if (d_head_out) CubDebugExit(g_allocator.DeviceFree(d_head_out)); if (d_tail_out) CubDebugExit(g_allocator.DeviceFree(d_tail_out)); if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); } /** * Run battery of tests for different full and partial tile sizes */ template < int WARPS, int LOGICAL_WARP_THREADS, typename T, typename ReductionOp> void Test( GenMode gen_mode, ReductionOp reduction_op) { // Partial tiles for ( int valid_warp_threads = 1; valid_warp_threads < LOGICAL_WARP_THREADS; valid_warp_threads += CUB_MAX(1, LOGICAL_WARP_THREADS / 5)) { // Without wrapper (to test non-excepting PTX POD-op specializations) TestReduce(gen_mode, reduction_op, valid_warp_threads); // With wrapper to ensure no ops called on OOB lanes WrapperFunctor wrapped_op(reduction_op, valid_warp_threads); TestReduce(gen_mode, wrapped_op, valid_warp_threads); } // Full tile TestReduce(gen_mode, reduction_op, LOGICAL_WARP_THREADS); // Segmented reduction with different head flags for (int flag_entropy = 0; flag_entropy < 10; ++flag_entropy) { TestSegmentedReduce(gen_mode, flag_entropy, reduction_op); } } /** * Run battery of tests for different data types and reduce ops */ template < int WARPS, int LOGICAL_WARP_THREADS> void Test(GenMode gen_mode) { // primitive Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum()); if (gen_mode != RANDOM) { Test( gen_mode, Sum()); Test( gen_mode, Sum()); } // primitive (alternative reduce op) Test( gen_mode, Max()); Test( gen_mode, Max()); Test( gen_mode, Max()); Test( gen_mode, Max()); // vec-1 Test( gen_mode, Sum()); // vec-2 Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( 
gen_mode, Sum()); Test( gen_mode, Sum()); // vec-4 Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum()); // complex Test( gen_mode, Sum()); Test( gen_mode, Sum()); } /** * Run battery of tests for different problem generation options */ template < int WARPS, int LOGICAL_WARP_THREADS> void Test() { Test(UNIFORM); Test(INTEGER_SEED); Test(RANDOM); } /** * Run battery of tests for different number of active warps */ template void Test() { Test<1, LOGICAL_WARP_THREADS>(); // Only power-of-two subwarps can be tiled if ((LOGICAL_WARP_THREADS == 32) || PowerOfTwo::VALUE) Test<2, LOGICAL_WARP_THREADS>(); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Test logical warp sizes Test<32>(); Test<16>(); Test<9>(); Test<7>(); Test<1>(); return 0; } cub-2.0.1/test/test_warp_scan.cu000066400000000000000000000464431434614775400166410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of WarpScan utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- static const int NUM_WARPS = 2; bool g_verbose = false; CachingDeviceAllocator g_allocator(true); /** * Primitive variant to test */ enum TestMode { BASIC, AGGREGATE, }; /** * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants) */ template struct WrapperFunctor { OpT op; WrapperFunctor(OpT op) : op(op) {} template __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const { return static_cast(op(a, b)); } }; //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /// Exclusive scan basic template __device__ __forceinline__ void DeviceTest( WarpScanT &warp_scan, T &data, T &initial_value, ScanOpT &scan_op, T &aggregate, Int2Type test_mode, IsPrimitiveT is_primitive) { // Test basic warp scan warp_scan.ExclusiveScan(data, data, initial_value, scan_op); } /// Exclusive scan aggregate template < typename WarpScanT, typename T, typename ScanOpT, typename IsPrimitiveT> __device__ __forceinline__ void DeviceTest( WarpScanT &warp_scan, T &data, T &initial_value, ScanOpT &scan_op, T &aggregate, Int2Type test_mode, IsPrimitiveT is_primitive) { // Test with cumulative aggregate warp_scan.ExclusiveScan(data, data, initial_value, scan_op, aggregate); } /// Exclusive sum basic template < typename WarpScanT, typename T> __device__ __forceinline__ void DeviceTest( WarpScanT &warp_scan, T &data, T &initial_value, Sum &scan_op, T &aggregate, Int2Type test_mode, Int2Type is_primitive) { // Test basic warp scan warp_scan.ExclusiveSum(data, data); } /// Exclusive sum aggregate template < typename WarpScanT, typename T> __device__ __forceinline__ void DeviceTest( WarpScanT &warp_scan, T &data, T &initial_value, Sum &scan_op, T &aggregate, Int2Type test_mode, Int2Type is_primitive) { // Test with cumulative aggregate warp_scan.ExclusiveSum(data, data, aggregate); } /// Inclusive scan basic template < typename WarpScanT, typename T, typename ScanOpT, typename IsPrimitiveT> __device__ __forceinline__ void DeviceTest( WarpScanT &warp_scan, T &data, NullType &initial_value, ScanOpT &scan_op, T &aggregate, Int2Type test_mode, IsPrimitiveT is_primitive) { // Test basic warp scan warp_scan.InclusiveScan(data, data, scan_op); } /// Inclusive scan aggregate template < typename WarpScanT, typename T, typename ScanOpT, typename IsPrimitiveT> __device__ __forceinline__ void DeviceTest( WarpScanT &warp_scan, T &data, NullType &initial_value, ScanOpT &scan_op, T &aggregate, Int2Type test_mode, IsPrimitiveT is_primitive) { // Test with cumulative aggregate warp_scan.InclusiveScan(data, data, scan_op, aggregate); } /// Inclusive sum basic template < typename WarpScanT, typename T, typename InitialValueT> __device__ __forceinline__ void DeviceTest( WarpScanT &warp_scan, T &data, NullType &initial_value, Sum &scan_op, T &aggregate, Int2Type test_mode, Int2Type is_primitive) { // Test 
basic warp scan warp_scan.InclusiveSum(data, data); } /// Inclusive sum aggregate template < typename WarpScanT, typename T, typename InitialValueT> __device__ __forceinline__ void DeviceTest( WarpScanT &warp_scan, T &data, NullType &initial_value, Sum &scan_op, T &aggregate, Int2Type test_mode, Int2Type is_primitive) { // Test with cumulative aggregate warp_scan.InclusiveSum(data, data, aggregate); } /** * WarpScan test kernel */ template < int LOGICAL_WARP_THREADS, TestMode TEST_MODE, typename T, typename ScanOpT, typename InitialValueT> __global__ void WarpScanKernel( T *d_in, T *d_out, T *d_aggregate, ScanOpT scan_op, InitialValueT initial_value, clock_t *d_elapsed) { // Cooperative warp-scan utility type (1 warp) typedef WarpScan WarpScanT; // Allocate temp storage in shared memory __shared__ typename WarpScanT::TempStorage temp_storage[NUM_WARPS]; // Get warp index int warp_id = threadIdx.x / LOGICAL_WARP_THREADS; // Per-thread tile data T data = d_in[threadIdx.x]; // Start cycle timer __threadfence_block(); // workaround to prevent clock hoisting clock_t start = clock(); __threadfence_block(); // workaround to prevent clock hoisting T aggregate; // Test scan WarpScanT warp_scan(temp_storage[warp_id]); DeviceTest( warp_scan, data, initial_value, scan_op, aggregate, Int2Type(), Int2Type::PRIMITIVE>()); // Stop cycle timer __threadfence_block(); // workaround to prevent clock hoisting clock_t stop = clock(); __threadfence_block(); // workaround to prevent clock hoisting // Store data d_out[threadIdx.x] = data; if (TEST_MODE != BASIC) { // Store aggregate d_aggregate[threadIdx.x] = aggregate; } // Store time if (threadIdx.x == 0) { *d_elapsed = (start > stop) ? start - stop : stop - start; } } //--------------------------------------------------------------------- // Host utility subroutines //--------------------------------------------------------------------- /** * Initialize exclusive-scan problem (and solution) */ template < typename T, typename ScanOpT> void Initialize( GenMode gen_mode, T *h_in, T *h_reference, int logical_warp_items, ScanOpT scan_op, T initial_value, T warp_aggregates[NUM_WARPS]) { for (int w = 0; w < NUM_WARPS; ++w) { int base_idx = (w * logical_warp_items); int i = base_idx; InitValue(gen_mode, h_in[i], i); T warp_aggregate = h_in[i]; h_reference[i] = initial_value; T inclusive = static_cast(scan_op(initial_value, h_in[i])); for (i = i + 1; i < base_idx + logical_warp_items; ++i) { InitValue(gen_mode, h_in[i], i); h_reference[i] = inclusive; inclusive = static_cast(scan_op(inclusive, h_in[i])); warp_aggregate = static_cast(scan_op(warp_aggregate, h_in[i])); } warp_aggregates[w] = warp_aggregate; } } /** * Initialize inclusive-scan problem (and solution) */ template < typename T, typename ScanOpT> void Initialize( GenMode gen_mode, T *h_in, T *h_reference, int logical_warp_items, ScanOpT scan_op, NullType, T warp_aggregates[NUM_WARPS]) { for (int w = 0; w < NUM_WARPS; ++w) { int base_idx = (w * logical_warp_items); int i = base_idx; InitValue(gen_mode, h_in[i], i); T warp_aggregate = h_in[i]; T inclusive = h_in[i]; h_reference[i] = inclusive; for (i = i + 1; i < base_idx + logical_warp_items; ++i) { InitValue(gen_mode, h_in[i], i); inclusive = static_cast(scan_op(inclusive, h_in[i])); warp_aggregate = static_cast(scan_op(warp_aggregate, h_in[i])); h_reference[i] = inclusive; } warp_aggregates[w] = warp_aggregate; } } /** * Test warp scan */ template < int LOGICAL_WARP_THREADS, TestMode TEST_MODE, typename T, typename ScanOpT, typename InitialValueT> // NullType 
implies inclusive-scan, otherwise inclusive scan void Test( GenMode gen_mode, ScanOpT scan_op, InitialValueT initial_value) { enum { TOTAL_ITEMS = LOGICAL_WARP_THREADS * NUM_WARPS, }; // Allocate host arrays T *h_in = new T[TOTAL_ITEMS]; T *h_reference = new T[TOTAL_ITEMS]; T *h_aggregate = new T[TOTAL_ITEMS]; // Initialize problem T aggregates[NUM_WARPS]; Initialize( gen_mode, h_in, h_reference, LOGICAL_WARP_THREADS, scan_op, initial_value, aggregates); if (g_verbose) { printf("Input: \n"); DisplayResults(h_in, TOTAL_ITEMS); printf("\n"); } for (int w = 0; w < NUM_WARPS; ++w) { for (int i = 0; i < LOGICAL_WARP_THREADS; ++i) { h_aggregate[(w * LOGICAL_WARP_THREADS) + i] = aggregates[w]; } } // Initialize/clear device arrays T *d_in = NULL; T *d_out = NULL; T *d_aggregate = NULL; clock_t *d_elapsed = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TOTAL_ITEMS)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (TOTAL_ITEMS + 1))); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * TOTAL_ITEMS)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t))); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * TOTAL_ITEMS, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (TOTAL_ITEMS + 1))); CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * TOTAL_ITEMS)); // Run kernel printf("Test-mode %d (%s), gen-mode %d (%s), %s warpscan, %d warp threads, %s (%d bytes) elements:\n", TEST_MODE, typeid(TEST_MODE).name(), gen_mode, typeid(gen_mode).name(), (std::is_same::value) ? "Inclusive" : "Exclusive", LOGICAL_WARP_THREADS, typeid(T).name(), (int) sizeof(T)); fflush(stdout); // Run aggregate/prefix kernel WarpScanKernel<<<1, TOTAL_ITEMS>>>( d_in, d_out, d_aggregate, scan_op, initial_value, d_elapsed); printf("\tElapsed clocks: "); DisplayDeviceResults(d_elapsed, 1); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Copy out and display results printf("\tScan results: "); int compare = CompareDeviceResults(h_reference, d_out, TOTAL_ITEMS, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Copy out and display aggregate if (TEST_MODE == AGGREGATE) { printf("\tScan aggregate: "); compare = CompareDeviceResults(h_aggregate, d_aggregate, TOTAL_ITEMS, g_verbose, g_verbose); printf("%s\n", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); } // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (h_aggregate) delete[] h_aggregate; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate)); if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); } /** * Run battery of tests for different primitive variants */ template < int LOGICAL_WARP_THREADS, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value) { // Exclusive Test(gen_mode, scan_op, T()); Test(gen_mode, scan_op, T()); // Exclusive (non-specialized, so we can use initial-value) Test(gen_mode, WrapperFunctor(scan_op), initial_value); Test(gen_mode, WrapperFunctor(scan_op), initial_value); // Inclusive Test(gen_mode, scan_op, NullType()); Test(gen_mode, scan_op, NullType()); } /** * Run battery of tests for different data types and scan ops */ template void Test(GenMode gen_mode) { // Get device ordinal int device_ordinal; CubDebugExit(cudaGetDevice(&device_ordinal)); // Get ptx version int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); // primitive Test(gen_mode, Sum(), (char) 99); Test(gen_mode, Sum(), (short) 99); Test(gen_mode, Sum(), (int) 99); Test(gen_mode, Sum(), (long) 99); Test(gen_mode, Sum(), (long long) 99); if (gen_mode != RANDOM) { // Only test numerically stable inputs Test(gen_mode, Sum(), (float) 99); if (ptx_version > 100) Test(gen_mode, Sum(), (double) 99); } // primitive (alternative scan op) Test(gen_mode, Max(), (unsigned char) 99); Test(gen_mode, Max(), (unsigned short) 99); Test(gen_mode, Max(), (unsigned int) 99); Test(gen_mode, Max(), (unsigned long long) 99); // vec-2 Test(gen_mode, Sum(), make_uchar2(17, 21)); Test(gen_mode, Sum(), make_ushort2(17, 21)); Test(gen_mode, Sum(), make_uint2(17, 21)); Test(gen_mode, Sum(), make_ulong2(17, 21)); Test(gen_mode, Sum(), make_ulonglong2(17, 21)); if (gen_mode != RANDOM) { // Only test numerically stable inputs Test(gen_mode, Sum(), make_float2(17, 21)); if (ptx_version > 100) Test(gen_mode, Sum(), make_double2(17, 21)); } // vec-4 Test(gen_mode, Sum(), make_char4(17, 21, 32, 85)); Test(gen_mode, Sum(), make_short4(17, 21, 32, 85)); Test(gen_mode, Sum(), make_int4(17, 21, 32, 85)); Test(gen_mode, Sum(), make_long4(17, 21, 32, 85)); Test(gen_mode, Sum(), make_longlong4(17, 21, 32, 85)); if (gen_mode != RANDOM) { // Only test numerically stable inputs Test(gen_mode, Sum(), make_float4(17, 21, 32, 85)); if (ptx_version > 100) Test(gen_mode, Sum(), make_double4(17, 21, 32, 85)); } // complex Test(gen_mode, Sum(), TestFoo::MakeTestFoo(17, 21, 32, 85)); Test(gen_mode, Sum(), TestBar(17, 21)); } /** * Run battery of tests for different problem generation options */ template void Test() { Test(UNIFORM); Test(INTEGER_SEED); Test(RANDOM); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Test logical warp sizes Test<32>(); Test<16>(); Test<9>(); Test<2>(); return 0; } cub-2.0.1/test/test_warp_store.cu000066400000000000000000000300301434614775400170320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; const int MAX_ITERATIONS = 30; template __global__ void kernel(OutputIteratorT output) { using WarpStoreT = WarpStore; constexpr int warps_in_block = BlockThreads / WarpThreads; constexpr int tile_size = ItemsPerThread * WarpThreads; const int warp_id = static_cast(threadIdx.x) / WarpThreads; __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; OutputT reg[ItemsPerThread]; for (int item = 0; item < ItemsPerThread; item++) { reg[item] = static_cast(threadIdx.x * ItemsPerThread + item); } WarpStoreT(temp_storage[warp_id]).Store(output + warp_id * tile_size, reg); } template __global__ void kernel(int valid_items, OutputIteratorT output) { using WarpStoreT = WarpStore; constexpr int warps_in_block = BlockThreads / WarpThreads; constexpr int tile_size = ItemsPerThread * WarpThreads; const int tid = static_cast(threadIdx.x); const int warp_id = tid / WarpThreads; __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; OutputT reg[ItemsPerThread]; for (int item = 0; item < ItemsPerThread; item++) { reg[item] = static_cast(threadIdx.x * ItemsPerThread + item); } WarpStoreT(temp_storage[warp_id]) .Store(output + warp_id * tile_size, reg, valid_items); } template thrust::device_vector GenExpectedOutput(int valid_items) { const int tile_size = WarpThreads * ItemsPerThread; const int total_warps = BlockThreads / WarpThreads; const int elements = total_warps * tile_size; thrust::device_vector input(elements); if (StoreAlgorithm == WarpStoreAlgorithm::WARP_STORE_STRIPED) { thrust::host_vector h_input(elements); // In this case we need different stripe pattern, so the // items/threads parameters are swapped constexpr int fake_block_size = ItemsPerThread * (BlockThreads / WarpThreads); FillStriped(h_input.begin()); input = h_input; } else { 
thrust::sequence(input.begin(), input.end()); } if (valid_items != elements) { for (int warp_id = 0; warp_id < total_warps; warp_id++) { thrust::fill(input.begin() + warp_id * tile_size + valid_items, input.begin() + (warp_id + 1) * tile_size, T{}); } } return input; } template < typename T, int BlockThreads, int WarpThreads, int ItemsPerThread, WarpStoreAlgorithm StoreAlgorithm> void CheckResults(int valid_items, const thrust::device_vector &output) { const thrust::device_vector expected_output = GenExpectedOutput(valid_items); AssertEquals(expected_output, output); } template void TestImplementation(OutputIteratorT output) { kernel<<<1, BlockThreads>>>(output); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); } template void TestImplementation(int valid_items, OutputIteratorT output) { kernel<<<1, BlockThreads>>>(valid_items, output); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); } template void TestPointer() { const int tile_size = WarpThreads * ItemsPerThread; const int total_warps = BlockThreads / WarpThreads; const int elements = total_warps * tile_size; thrust::device_vector output(elements); thrust::fill(output.begin(), output.end(), T{}); TestImplementation( thrust::raw_pointer_cast(output.data())); CheckResults( elements, output); const unsigned int max_valid_items = WarpThreads * ItemsPerThread; for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { const int valid_items = static_cast(RandomValue(max_valid_items)); thrust::fill(output.begin(), output.end(), T{}); TestImplementation( valid_items, thrust::raw_pointer_cast(output.data())); CheckResults( valid_items, output); } } template void TestIterator() { const int tile_size = WarpThreads * ItemsPerThread; const int total_warps = BlockThreads / WarpThreads; const int elements = total_warps * tile_size; thrust::device_vector output(elements); thrust::fill(output.begin(), output.end(), T{}); TestImplementation( CacheModifiedOutputIterator( thrust::raw_pointer_cast(output.data()))); CheckResults( elements, output); const int max_valid_items = WarpThreads * ItemsPerThread; for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) { const int valid_items = RandomValue(max_valid_items); thrust::fill(output.begin(), output.end(), T{}); TestImplementation( valid_items, CacheModifiedOutputIterator( thrust::raw_pointer_cast(output.data()))); CheckResults( valid_items, output); } } template void TestIterator() { TestIterator(); TestIterator(); TestIterator(); TestIterator(); TestIterator(); TestIterator(); } template void Test() { TestPointer(); TestIterator(); } template void Test() { Test(); Test(); Test(); Test(); } template void Test() { Test(); Test(); Test(); } template void Test() { Test(); Test(); Test(); } template void Test() { Test(); Test(); Test(); } int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); Test<256>(); }
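// ----------------------------------------------------------------------------
// Illustrative sketch (an addition, not part of the test file above): a
// distilled version of the store pattern the tests exercise -- one
// WarpStore::TempStorage slot per logical warp, a per-thread register tile,
// and the guarded Store() overload for partially full tiles. The kernel and
// helper names are this sketch's own; the header path is assumed.
#include <cub/warp/warp_store.cuh>   // assumed: cub::WarpStore, cub::WARP_STORE_TRANSPOSE
#include <cuda_runtime.h>

template <int BLOCK_THREADS, int WARP_THREADS, int ITEMS_PER_THREAD>
__global__ void WarpStoreSketchKernel(int *d_out, int valid_items)
{
  using WarpStoreT = cub::WarpStore<int,
                                    ITEMS_PER_THREAD,
                                    cub::WARP_STORE_TRANSPOSE,
                                    WARP_THREADS>;

  constexpr int warps_in_block = BLOCK_THREADS / WARP_THREADS;
  constexpr int tile_size      = WARP_THREADS * ITEMS_PER_THREAD;

  // One shared-memory TempStorage instance per logical warp in the block.
  __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];

  const int warp_id = static_cast<int>(threadIdx.x) / WARP_THREADS;

  // Each thread contributes ITEMS_PER_THREAD register items to its warp's tile.
  int items[ITEMS_PER_THREAD];
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    items[i] = static_cast<int>(threadIdx.x) * ITEMS_PER_THREAD + i;
  }

  // Guarded store: only the first valid_items elements of this warp's tile
  // reach global memory; the remaining slots of d_out are left untouched.
  WarpStoreT(temp_storage[warp_id])
    .Store(d_out + warp_id * tile_size, items, valid_items);
}

// Example host-side launch: two 16-lane logical warps per 32-thread block,
// each storing only the first 20 of its 64-item tile. d_out must provide at
// least 2 * 64 ints.
inline void WarpStoreSketchLaunch(int *d_out)
{
  WarpStoreSketchKernel<32, 16, 4><<<1, 32>>>(d_out, 20);
  cudaDeviceSynchronize();
}
// ----------------------------------------------------------------------------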