upstream/0000775000175000017500000000000014637253000011356 5ustar kaolkaolupstream/.clang-format0000664000175000017500000000654214637252753013756 0ustar kaolkaol# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: { AfterCaseLabel: 'true' AfterClass: 'true' AfterControlStatement: 'true' AfterEnum : 'true' AfterFunction : 'true' AfterNamespace : 'true' AfterStruct : 'true' AfterUnion : 'true' BeforeCatch : 'true' BeforeElse : 'true' IndentBraces : 'false' # AfterExternBlock : 'true' } #BreakAfterJavaFieldAnnotations: true #BreakBeforeInheritanceComma: false #BreakBeforeBinaryOperators: None #BreakBeforeTernaryOperators: true #BreakConstructorInitializersBeforeComma: true 
#BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' #CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true SpaceBeforeCpp11BracedList: false DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IndentCaseLabels: false IndentPPDirectives: None #FixNamespaceComments: true IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true MacroBlockBegin: '' MacroBlockEnd: '' #JavaScriptQuotes: Double MaxEmptyLinesToKeep: 1 NamespaceIndentation: All ObjCBlockIndentWidth: 4 #ObjCSpaceAfterProperty: true #ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left SpaceAfterCStyleCast: false SpaceBeforeAssignmentOperators: true SpaceBeforeParens: Never SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false #SpaceAfterTemplateKeyword: true #SpaceBeforeInheritanceColon: true #SortUsingDeclarations: true SortIncludes: true # Comments are for developers, they should arrange them ReflowComments: false #IncludeBlocks: Preserve --- upstream/shared/0000775000175000017500000000000014637253000012624 5ustar kaolkaolupstream/shared/rocfft_accuracy_test.h0000664000175000017500000000250514637252753017211 0ustar kaolkaol// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCFFT_ACCURACY_TEST #define ROCFFT_ACCURACY_TEST #include "accuracy_test.h" #include "rocfft_params.h" void fft_vs_reference(rocfft_params& params, bool round_trip = false); #endif upstream/shared/printbuffer.h0000664000175000017500000001024214637252753015340 0ustar kaolkaol// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef PRINTBUFFER_H #define PRINTBUFFER_H #include "hostbuf.h" #include "increment.h" #include #include // Output a formatted general-dimensional array with given length and stride in batches // separated by dist. 
template inline void printbuffer(const Toutput* output, const std::vector& length, const std::vector& stride, const Tsize nbatch, const Tsize dist, const size_t offset, Tstream& stream) { auto i_base = 0; for(unsigned int b = 0; b < nbatch; b++, i_base += dist) { std::vector index(length.size()); std::fill(index.begin(), index.end(), 0); do { const int i = std::inner_product(index.begin(), index.end(), stride.begin(), i_base + offset); stream << output[i] << " "; for(int li = index.size(); li-- > 0;) { if(index[li] == (length[li] - 1)) { stream << "\n"; } else { break; } } } while(increment_rowmajor(index, length)); stream << std::endl; } } template class buffer_printer { // The scalar versions might be part of a planar format. public: template static void print_buffer(const std::vector& buf, const std::vector& length, const std::vector& stride, const Tsize nbatch, const Tsize dist, const std::vector& offset, Tstream& stream = std::cout) { for(const auto& vec : buf) { printbuffer(reinterpret_cast(vec.data()), length, stride, nbatch, dist, offset[0], stream); } }; template static void print_buffer_flat(const std::vector& buf, const std::vector& size, const std::vector& offset, Tstream& stream = std::cout) { for(const auto& vec : buf) { auto data = reinterpret_cast(vec.data()); stream << "idx " << 0; for(size_t i = 0; i < size[0]; ++i) stream << " " << data[i]; stream << std::endl; } }; }; #endif upstream/shared/rocfft_params.h0000664000175000017500000005476314637253000015642 0ustar kaolkaol// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ROCFFT_PARAMS_H #define ROCFFT_PARAMS_H #include "../shared/fft_params.h" #include "../shared/gpubuf.h" #include "rocfft/rocfft.h" // Return the string of the rocfft_status code static std::string rocfft_status_to_string(const rocfft_status ret) { switch(ret) { case rocfft_status_success: return "rocfft_status_success"; case rocfft_status_failure: return "rocfft_status_failure"; case rocfft_status_invalid_arg_value: return "rocfft_status_invalid_arg_value"; case rocfft_status_invalid_dimensions: return "rocfft_status_invalid_dimensions"; case rocfft_status_invalid_array_type: return "rocfft_status_invalid_array_type"; case rocfft_status_invalid_strides: return "rocfft_status_invalid_strides"; case rocfft_status_invalid_distance: return "rocfft_status_invalid_distance"; case rocfft_status_invalid_offset: return "rocfft_status_invalid_offset"; case rocfft_status_invalid_work_buffer: return "rocfft_status_invalid_work_buffer"; default: throw std::runtime_error("unknown rocfft_status"); } } inline fft_status fft_status_from_rocfftparams(const rocfft_status val) { switch(val) { case rocfft_status_success: return fft_status_success; case rocfft_status_failure: return fft_status_failure; case rocfft_status_invalid_arg_value: return fft_status_invalid_arg_value; case rocfft_status_invalid_dimensions: return fft_status_invalid_dimensions; case rocfft_status_invalid_array_type: return fft_status_invalid_array_type; case rocfft_status_invalid_strides: return fft_status_invalid_strides; case rocfft_status_invalid_distance: return fft_status_invalid_distance; case rocfft_status_invalid_offset: return fft_status_invalid_offset; case rocfft_status_invalid_work_buffer: return fft_status_invalid_work_buffer; default: throw std::runtime_error("Invalid status"); } } inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val) { switch(val) { case fft_precision_single: return rocfft_precision_single; case fft_precision_double: return 
rocfft_precision_double; case fft_precision_half: return rocfft_precision_half; default: throw std::runtime_error("Invalid precision"); } } inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val) { switch(val) { case fft_array_type_complex_interleaved: return rocfft_array_type_complex_interleaved; case fft_array_type_complex_planar: return rocfft_array_type_complex_planar; case fft_array_type_real: return rocfft_array_type_real; case fft_array_type_hermitian_interleaved: return rocfft_array_type_hermitian_interleaved; case fft_array_type_hermitian_planar: return rocfft_array_type_hermitian_planar; case fft_array_type_unset: return rocfft_array_type_unset; } return rocfft_array_type_unset; } inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val) { switch(val) { case fft_transform_type_complex_forward: return rocfft_transform_type_complex_forward; case fft_transform_type_complex_inverse: return rocfft_transform_type_complex_inverse; case fft_transform_type_real_forward: return rocfft_transform_type_real_forward; case fft_transform_type_real_inverse: return rocfft_transform_type_real_inverse; default: throw std::runtime_error("Invalid transform type"); } } inline rocfft_result_placement rocfft_result_placement_from_fftparams(const fft_result_placement val) { switch(val) { case fft_placement_inplace: return rocfft_placement_inplace; case fft_placement_notinplace: return rocfft_placement_notinplace; default: throw std::runtime_error("Invalid result placement"); } } class rocfft_params : public fft_params { public: rocfft_plan plan = nullptr; rocfft_execution_info info = nullptr; rocfft_plan_description desc = nullptr; gpubuf_t wbuffer; explicit rocfft_params(){}; explicit rocfft_params(const fft_params& p) : fft_params(p){}; rocfft_params(const rocfft_params&) = delete; rocfft_params& operator=(const rocfft_params&) = delete; ~rocfft_params() { free(); }; void free() { if(plan != nullptr) { 
rocfft_plan_destroy(plan); plan = nullptr; } if(info != nullptr) { rocfft_execution_info_destroy(info); info = nullptr; } if(desc != nullptr) { rocfft_plan_description_destroy(desc); desc = nullptr; } wbuffer.free(); } void validate_fields() const override { // row-major lengths including batch (i.e. batch is at the front) std::vector length_with_batch{nbatch}; std::copy(length.begin(), length.end(), std::back_inserter(length_with_batch)); auto validate_field = [&](const fft_field& f) { for(const auto& b : f.bricks) { // bricks must have same dim as FFT, including batch if(b.lower.size() != length.size() + 1 || b.upper.size() != length.size() + 1 || b.stride.size() != length.size() + 1) throw std::runtime_error( "brick dimension does not match FFT + batch dimension"); // ensure lower < upper, and that both fit in the FFT + batch dims if(!std::lexicographical_compare( b.lower.begin(), b.lower.end(), b.upper.begin(), b.upper.end())) throw std::runtime_error("brick lower index is not less than upper index"); if(!std::lexicographical_compare(b.lower.begin(), b.lower.end(), length_with_batch.begin(), length_with_batch.end())) throw std::runtime_error( "brick lower index is not less than FFT + batch length"); if(!std::lexicographical_compare(b.upper.begin(), b.upper.end(), length_with_batch.begin(), length_with_batch.end()) && b.upper != length_with_batch) throw std::runtime_error("brick upper index is not <= FFT + batch length"); } }; for(const auto& ifield : ifields) validate_field(ifield); for(const auto& ofield : ofields) validate_field(ofield); } rocfft_precision get_rocfft_precision() { return rocfft_precision_from_fftparams(precision); } size_t vram_footprint() override { size_t val = fft_params::vram_footprint(); if(setup_structs() != fft_status_success) { throw std::runtime_error("Struct setup failed"); } val += workbuffersize; return val; } // Convert the generic fft_field structure to a rocfft_field // structure that can be passed to rocFFT. 
In particular, we need // to convert from row-major to column-major. static rocfft_field fft_field_to_rocfft_field(const fft_field& f) { rocfft_field rfield = nullptr; if(f.bricks.empty()) return rfield; if(rocfft_field_create(&rfield) != rocfft_status_success) throw std::runtime_error("rocfft_field_create failed"); for(const auto& b : f.bricks) { // rocFFT wants column-major bricks and fft_params stores // row-major std::vector lower_cm; std::copy(b.lower.rbegin(), b.lower.rend(), std::back_inserter(lower_cm)); std::vector upper_cm; std::copy(b.upper.rbegin(), b.upper.rend(), std::back_inserter(upper_cm)); std::vector stride_cm; std::copy(b.stride.rbegin(), b.stride.rend(), std::back_inserter(stride_cm)); rocfft_brick rbrick = nullptr; if(rocfft_brick_create(&rbrick, lower_cm.data(), // field_lower upper_cm.data(), // field_upper stride_cm.data(), // brick_stride lower_cm.size(), // dim b.device) // deviceID != rocfft_status_success) throw std::runtime_error("rocfft_brick_create failed"); if(rocfft_field_add_brick(rfield, rbrick) != rocfft_status_success) throw std::runtime_error("rocfft_field_add_brick failed"); rocfft_brick_destroy(rbrick); } return rfield; } fft_status setup_structs() { rocfft_status fft_status = rocfft_status_success; if(desc == nullptr) { rocfft_plan_description_create(&desc); if(fft_status != rocfft_status_success) return fft_status_from_rocfftparams(fft_status); fft_status = rocfft_plan_description_set_data_layout(desc, rocfft_array_type_from_fftparams(itype), rocfft_array_type_from_fftparams(otype), ioffset.data(), ooffset.data(), istride_cm().size(), istride_cm().data(), idist, ostride_cm().size(), ostride_cm().data(), odist); if(fft_status != rocfft_status_success) { throw std::runtime_error("rocfft_plan_description_set_data_layout failed"); } if(scale_factor != 1.0) { fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor); if(fft_status != rocfft_status_success) { throw 
std::runtime_error("rocfft_plan_description_set_scale_factor failed"); } } for(const auto& ifield : ifields) { rocfft_field infield = fft_field_to_rocfft_field(ifield); if(rocfft_plan_description_add_infield(desc, infield) != rocfft_status_success) throw std::runtime_error("rocfft_description_add_infield failed"); rocfft_field_destroy(infield); } for(const auto& ofield : ofields) { rocfft_field outfield = fft_field_to_rocfft_field(ofield); if(rocfft_plan_description_add_outfield(desc, outfield) != rocfft_status_success) throw std::runtime_error("rocfft_description_add_outfield failed"); rocfft_field_destroy(outfield); } } if(plan == nullptr) { fft_status = rocfft_plan_create(&plan, rocfft_result_placement_from_fftparams(placement), rocfft_transform_type_from_fftparams(transform_type), get_rocfft_precision(), length_cm().size(), length_cm().data(), nbatch, desc); if(fft_status != rocfft_status_success) { throw std::runtime_error("rocfft_plan_create failed"); } } if(info == nullptr) { fft_status = rocfft_execution_info_create(&info); if(fft_status != rocfft_status_success) { throw std::runtime_error("rocfft_execution_info_create failed"); } } fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize); if(fft_status != rocfft_status_success) { throw std::runtime_error("rocfft_plan_get_work_buffer_size failed"); } return fft_status_from_rocfftparams(fft_status); } fft_status create_plan() override { fft_status ret = setup_structs(); if(ret != fft_status_success) { return ret; } if(workbuffersize > 0) { hipError_t hip_status = hipSuccess; hip_status = wbuffer.alloc(workbuffersize); if(hip_status != hipSuccess) { std::ostringstream oss; oss << "work buffer allocation failed (" << workbuffersize << " requested)"; size_t mem_free = 0; size_t mem_total = 0; hip_status = hipMemGetInfo(&mem_free, &mem_total); if(hip_status == hipSuccess) { oss << "free vram: " << mem_free << " total vram: " << mem_total; } else { oss << "hipMemGetInfo also failed"; } throw 
work_buffer_alloc_failure(oss.str()); } auto rocret = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize); if(rocret != rocfft_status_success) { throw std::runtime_error("rocfft_execution_info_set_work_buffer failed"); } } return ret; } fft_status set_callbacks(void* load_cb_host, void* load_cb_data, void* store_cb_host, void* store_cb_data) override { if(run_callbacks) { auto roc_status = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0); if(roc_status != rocfft_status_success) return fft_status_from_rocfftparams(roc_status); roc_status = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0); if(roc_status != rocfft_status_success) return fft_status_from_rocfftparams(roc_status); } return fft_status_success; } fft_status execute(void** in, void** out) override { auto ret = rocfft_execute(plan, in, out, info); return fft_status_from_rocfftparams(ret); } // scatter data to multiple GPUs and adjust I/O buffers to match void multi_gpu_prepare(std::vector& ibuffer, std::vector& pibuffer, std::vector& pobuffer) override { auto alloc_fields = [&](const fft_params::fft_field& field, fft_array_type array_type, std::vector& pbuffer, bool copy_input) { if(field.bricks.empty()) return; // we have a field defined, clear the list of buffers as // we'll be allocating new ones for each brick pbuffer.clear(); for(const auto& b : field.bricks) { // get brick's length - note that this includes batch // dimension const auto brick_len = b.length(); const auto brick_stride = b.stride; const size_t brick_size_elems = product(brick_len.begin(), brick_len.end()); const size_t elem_size_bytes = var_size(precision, array_type); const size_t brick_size_bytes = brick_size_elems * elem_size_bytes; // set device for the alloc, but we want to return to the // default device as the source of a following memcpy { rocfft_scoped_device dev(b.device); multi_gpu_data.emplace_back(); 
if(multi_gpu_data.back().alloc(brick_size_bytes) != hipSuccess) throw std::runtime_error("device allocation failure"); pbuffer.push_back(multi_gpu_data.back().data()); } if(copy_input) { // For now, assume we're only splitting on highest FFT // dimension, lower-dimensional FFT data is all // contiguous, and batches are contiguous in each brick. // // That means we can express this as a 2D memcpy. const size_t unbatched_elems_per_brick = product(brick_len.begin() + 1, brick_len.end()); const size_t unbatched_elems_per_fft = product(length.begin(), length.end()); // get this brick's starting offset in the field const size_t brick_offset = b.lower_field_offset(istride, idist) * elem_size_bytes; // copy from original input - note that we're // assuming interleaved data so ibuffer has only one // gpubuf if(hipMemcpy2D(pbuffer.back(), unbatched_elems_per_brick * elem_size_bytes, ibuffer.front().data_offset(brick_offset), unbatched_elems_per_fft * elem_size_bytes, unbatched_elems_per_brick * elem_size_bytes, brick_len.front(), hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failure"); } } // if we copied the input to all the other devices, and // this is an out-of-place transform, we no longer // need the original input if(copy_input && placement == fft_placement_notinplace) ibuffer.clear(); }; // assume one input, one output field for simple cases if(!ifields.empty()) alloc_fields(ifields.front(), itype, pibuffer, true); if(!ofields.empty()) { if(!ifields.empty() && placement == fft_placement_inplace) pobuffer = pibuffer; else alloc_fields(ofields.front(), otype, pobuffer, false); } } // when preparing for multi-GPU transform, we need to allocate data // on each GPU. This vector remembers all of those allocations. 
std::vector multi_gpu_data; // gather data after multi-GPU FFT for verification void multi_gpu_finalize(std::vector& obuffer, std::vector& pobuffer) override { if(ofields.empty()) return; for(size_t i = 0; i < ofields.front().bricks.size(); ++i) { const auto& b = ofields.front().bricks[i]; const auto& brick_ptr = pobuffer[i]; const auto brick_len = b.length(); const size_t elem_size_bytes = var_size(precision, otype); // get this brick's starting offset in the field const size_t brick_offset = b.lower_field_offset(ostride, odist) * elem_size_bytes; // switch device to where we're copying from rocfft_scoped_device dev(b.device); // For now, assume we're only splitting on highest FFT // dimension, lower-dimensional FFT data is all // contiguous, and batches are contiguous in each brick. // // That means we can express this as a 2D memcpy. const size_t unbatched_elems_per_brick = product(brick_len.begin() + 1, brick_len.end()); const auto output_length = olength(); const size_t unbatched_elems_per_fft = product(output_length.begin(), output_length.end()); // copy to original output buffer - note that // we're assuming interleaved data so obuffer // has only one gpubuf if(hipMemcpy2D(obuffer.front().data_offset(brick_offset), unbatched_elems_per_fft * elem_size_bytes, brick_ptr, unbatched_elems_per_brick * elem_size_bytes, unbatched_elems_per_brick * elem_size_bytes, brick_len.front(), hipMemcpyDeviceToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failure"); // device-to-device transfers don't synchronize with the // host, add explicit sync (void)hipDeviceSynchronize(); } pobuffer.clear(); pobuffer.push_back(obuffer.front().data()); } }; #endif upstream/shared/arithmetic.h0000664000175000017500000000401714637253000015130 0ustar kaolkaol/****************************************************************************** * Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #pragma once #include #include // arithmetic helper functions static inline bool IsPo2(size_t u) { return (u != 0) && (0 == (u & (u - 1))); } // help function: Find the smallest power of 2 that is >= n; return its // power of 2 factor // e.g., CeilPo2 (7) returns 3 : (2^3 >= 7) static inline size_t CeilPo2(size_t n) { size_t v = 1, t = 0; while(v < n) { v <<= 1; t++; } return t; } template static inline T DivRoundingUp(T a, T b) { return (a + (b - 1)) / b; } template typename Titer::value_type product(Titer begin, Titer end) { return std::accumulate( begin, end, typename Titer::value_type(1), std::multiplies()); } upstream/shared/enum_to_string.h0000664000175000017500000000620214637252753016047 0ustar kaolkaol// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ENUM_TO_STRING_H #define ENUM_TO_STRING_H #include "fft_params.h" // Return the string of the hipError code. 
static std::string hipError_to_string(const hipError_t ret) { switch(ret) { case hipSuccess: return "hipSuccess"; case hipErrorInvalidContext: return "hipErrorInvalidContext"; case hipErrorInvalidKernelFile: return "hipErrorInvalidKernelFile"; case hipErrorMemoryAllocation: return "hipErrorMemoryAllocation"; case hipErrorInitializationError: return "hipErrorInitializationError"; case hipErrorLaunchFailure: return "hipErrorLaunchFailure"; case hipErrorLaunchOutOfResources: return "hipErrorLaunchOutOfResources"; case hipErrorInvalidDevice: return "hipErrorInvalidDevice"; case hipErrorInvalidValue: return "hipErrorInvalidValue"; case hipErrorInvalidDevicePointer: return "hipErrorInvalidDevicePointer"; case hipErrorInvalidMemcpyDirection: return "hipErrorInvalidMemcpyDirection"; case hipErrorUnknown: return "hipErrorUnknown"; case hipErrorInvalidResourceHandle: return "hipErrorInvalidResourceHandle"; case hipErrorNotReady: return "hipErrorNotReady"; case hipErrorNoDevice: return "hipErrorNoDevice"; case hipErrorPeerAccessAlreadyEnabled: return "hipErrorPeerAccessAlreadyEnabled"; case hipErrorPeerAccessNotEnabled: return "hipErrorPeerAccessNotEnabled"; case hipErrorRuntimeMemory: return "hipErrorRuntimeMemory"; case hipErrorRuntimeOther: return "hipErrorRuntimeOther"; case hipErrorHostMemoryAlreadyRegistered: return "hipErrorHostMemoryAlreadyRegistered"; case hipErrorHostMemoryNotRegistered: return "hipErrorHostMemoryNotRegistered"; case hipErrorMapBufferObjectFailed: return "hipErrorMapBufferObjectFailed"; case hipErrorTbd: return "hipErrorTbd"; default: throw std::runtime_error("unknown hipError"); } } #endif upstream/shared/gpubuf.h0000664000175000017500000000716414637252753014313 0ustar kaolkaol// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCFFT_GPUBUF_H #define ROCFFT_GPUBUF_H #include "rocfft_hip.h" #include // Simple RAII class for GPU buffers. 
T is the type of pointer that // data() returns template class gpubuf_t { public: gpubuf_t() {} // buffers are movable but not copyable gpubuf_t(gpubuf_t&& other) { std::swap(buf, other.buf); std::swap(bsize, other.bsize); std::swap(device, other.device); } gpubuf_t& operator=(gpubuf_t&& other) { std::swap(buf, other.buf); std::swap(bsize, other.bsize); std::swap(device, other.device); return *this; } gpubuf_t(const gpubuf_t&) = delete; gpubuf_t& operator=(const gpubuf_t&) = delete; ~gpubuf_t() { free(); } static bool use_alloc_managed() { return std::getenv("ROCFFT_MALLOC_MANAGED"); } hipError_t alloc(const size_t size) { // remember the device that was current as of alloc, so we can // free on the correct device auto ret = hipGetDevice(&device); if(ret != hipSuccess) return ret; bsize = size; static bool alloc_managed = use_alloc_managed(); free(); ret = alloc_managed ? hipMallocManaged(&buf, bsize) : hipMalloc(&buf, bsize); if(ret != hipSuccess) { buf = nullptr; bsize = 0; } return ret; } size_t size() const { return bsize; } void free() { if(buf != nullptr) { // free on the device we allocated on rocfft_scoped_device dev(device); (void)hipFree(buf); buf = nullptr; bsize = 0; } } // return a pointer to the allocated memory, offset by the // specified number of bytes T* data_offset(size_t offset_bytes = 0) const { void* ptr = static_cast(buf) + offset_bytes; return static_cast(ptr); } T* data() const { return static_cast(buf); } // equality/bool tests bool operator==(std::nullptr_t n) const { return buf == n; } bool operator!=(std::nullptr_t n) const { return buf != n; } operator bool() const { return buf; } private: // The GPU buffer void* buf = nullptr; size_t bsize = 0; int device = 0; }; // default gpubuf that gives out void* pointers typedef gpubuf_t<> gpubuf; #endif upstream/shared/work_queue.h0000664000175000017500000000340714637252753015205 0ustar kaolkaol// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. 
#pragma once

#include <condition_variable>
#include <mutex>
#include <queue>

// Thread-safe FIFO used to hand work items from producer threads to
// consumer threads.  pop() blocks until an item is available.
template <typename _WorkItem>
struct WorkQueue
{
    // Enqueue a work item and wake a waiting consumer.
    void push(_WorkItem&& i)
    {
        std::unique_lock<std::mutex> lock(queueMutex);
        items.emplace(std::move(i));
        // exactly one item was added, so one waiter is enough
        // (previously notify_all woke every waiter for a single item)
        emptyWait.notify_one();
    }

    // Block until an item is available, then dequeue and return it.
    _WorkItem pop()
    {
        std::unique_lock<std::mutex> lock(queueMutex);
        // predicate overload handles spurious wakeups for us
        emptyWait.wait(lock, [this] { return !items.empty(); });
        // move out of the queue instead of copying the front element
        _WorkItem item(std::move(items.front()));
        items.pop();
        return item;
    }

private:
    std::queue<_WorkItem>   items;
    std::mutex              queueMutex;
    std::condition_variable emptyWait;
};
#pragma once

// Compute the index (in elements) of the farthest point reachable
// from the original pointer, given per-dimension lengths and strides,
// a batch count and a batch distance.  Returns 0 when length is empty.
static size_t compute_ptrdiff(const std::vector<size_t>& length,
                              const std::vector<size_t>& stride,
                              const size_t               nbatch,
                              const size_t               dist)
{
    if(length.empty())
        return 0;

    // start at the first element...
    size_t farthest = 1;
    // ...advance to the last index along each dimension...
    for(size_t dim = 0; dim < length.size(); ++dim)
        farthest += (length[dim] - 1) * stride[dim];
    // ...and to the last batch
    farthest += (nbatch - 1) * dist;
    return farthest;
}
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef FFTWTRANSFORM_H #define FFTWTRANSFORM_H #include "hostbuf.h" #include "rocfft_complex.h" #include "test_params.h" #include #include // Function to return maximum error for float and double types. // // Following Schatzman (1996; Accuracy of the Discrete Fourier // Transform and the Fast Fourier Transform), the shape of relative // l_2 error vs length should look like // // epsilon * sqrt(log2(length)). // // The magic epsilon constants below were chosen so that we get a // reasonable upper bound for (all of) our tests. // // For rocFFT, prime lengths result in the highest error. As such, // the epsilons below are perhaps too loose for pow2 lengths; but they // are appropriate for prime lengths. 
template inline double type_epsilon(); template <> inline double type_epsilon<_Float16>() { return half_epsilon; } template <> inline double type_epsilon() { return single_epsilon; } template <> inline double type_epsilon() { return double_epsilon; } // C++ traits to translate float->fftwf_complex and // double->fftw_complex. // The correct FFTW complex type can be accessed via, for example, // using complex_t = typename fftw_complex_trait::complex_t; template struct fftw_trait; template <> struct fftw_trait<_Float16> { // fftw does not support half precision, so use single precision and convert using fftw_complex_type = fftwf_complex; using fftw_plan_type = fftwf_plan; }; template <> struct fftw_trait { using fftw_complex_type = fftwf_complex; using fftw_plan_type = fftwf_plan; }; template <> struct fftw_trait { using fftw_complex_type = fftw_complex; using fftw_plan_type = fftw_plan; }; // Copies the half-precision input buffer to a single-precision // buffer. Note that the input buffer is already sized like it's a // single-precision buffer (but only half of it is filled), because // we allocate a single-precision buffer for FFTW to plan with. 
static hostbuf half_to_single_copy(const hostbuf& in) { auto out = in.copy(); auto in_begin = reinterpret_cast(in.data()); std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast(out.data())); return out; } // converts a wider precision buffer to a narrower precision, in-place template void narrow_precision_inplace(hostbuf& in) { // ensure we're actually shrinking the data static_assert(sizeof(TfloatIn) > sizeof(TfloatOut)); auto readPtr = reinterpret_cast(in.data()); auto writePtr = reinterpret_cast(in.data()); std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr); in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut))); } static void single_to_half_inplace(hostbuf& in) { narrow_precision_inplace(in); } // Template wrappers for real-valued FFTW allocators: template inline Tfloat* fftw_alloc_real_type(size_t n); template <> inline float* fftw_alloc_real_type(size_t n) { return fftwf_alloc_real(n); } template <> inline double* fftw_alloc_real_type(size_t n) { return fftw_alloc_real(n); } // Template wrappers for complex-valued FFTW allocators: template inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n); template <> inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n) { return fftwf_alloc_complex(n); } template <> inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n) { return fftw_alloc_complex(n); } template inline fftw_type* fftw_alloc_type(size_t n); template <> inline float* fftw_alloc_type(size_t n) { return fftw_alloc_real_type(n); } template <> inline double* fftw_alloc_type(size_t n) { return fftw_alloc_real_type(n); } template <> inline fftwf_complex* fftw_alloc_type(size_t n) { return fftw_alloc_complex_type(n); } template <> inline fftw_complex* fftw_alloc_type(size_t n) { return fftw_alloc_complex_type(n); } template <> inline rocfft_complex* fftw_alloc_type>(size_t n) { return (rocfft_complex*)fftw_alloc_complex_type(n); } template <> inline 
rocfft_complex* fftw_alloc_type>(size_t n) { return (rocfft_complex*)fftw_alloc_complex_type(n); } // Template wrappers for FFTW plan executors: template inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan); template <> inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan) { return fftwf_execute(plan); } template <> inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan) { return fftw_execute(plan); } // Template wrappers for FFTW plan destroyers: template inline void fftw_destroy_plan_type(Tfftw_plan plan); template <> inline void fftw_destroy_plan_type(fftwf_plan plan) { return fftwf_destroy_plan(plan); } template <> inline void fftw_destroy_plan_type(fftw_plan plan) { return fftw_destroy_plan(plan); } // Template wrappers for FFTW c2c planners: template inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, typename fftw_trait::fftw_complex_type* out, int sign, unsigned flags); template <> inline typename fftw_trait<_Float16>::fftw_plan_type fftw_plan_guru64_dft<_Float16>(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait<_Float16>::fftw_complex_type* in, typename fftw_trait<_Float16>::fftw_complex_type* out, int sign, unsigned flags) { return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, typename fftw_trait::fftw_complex_type* out, int sign, unsigned flags) { return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int 
howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, typename fftw_trait::fftw_complex_type* out, int sign, unsigned flags) { return fftw_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); } // Template wrappers for FFTW c2c executors: template inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out); template <> inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan, std::vector& in, std::vector& out) { // since FFTW does not natively support half precision, convert // input to single, execute, then convert output back to half auto in_single = half_to_single_copy(in.front()); fftwf_execute_dft(plan, reinterpret_cast(in_single.data()), reinterpret_cast(out.front().data())); single_to_half_inplace(out.front()); } template <> inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftwf_execute_dft(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } template <> inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftw_execute_dft(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } // Template wrappers for FFTW r2c planners: template inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_r2c(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, Tfloat* in, typename fftw_trait::fftw_complex_type* out, unsigned flags); template <> inline typename fftw_trait<_Float16>::fftw_plan_type fftw_plan_guru64_r2c<_Float16>(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, _Float16* in, typename fftw_trait<_Float16>::fftw_complex_type* out, unsigned flags) { return fftwf_plan_guru64_dft_r2c( rank, dims, howmany_rank, howmany_dims, reinterpret_cast(in), out, flags); } 
template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_r2c(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, float* in, typename fftw_trait::fftw_complex_type* out, unsigned flags) { return fftwf_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_r2c(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, double* in, typename fftw_trait::fftw_complex_type* out, unsigned flags) { return fftw_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); } // Template wrappers for FFTW r2c executors: template inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out); template <> inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { // since FFTW does not natively support half precision, convert // input to single, execute, then convert output back to half auto in_single = half_to_single_copy(in.front()); fftwf_execute_dft_r2c(plan, reinterpret_cast(in_single.data()), reinterpret_cast(out.front().data())); single_to_half_inplace(out.front()); } template <> inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftwf_execute_dft_r2c(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } template <> inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftw_execute_dft_r2c(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } // Template wrappers for FFTW c2r planners: template inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_c2r(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, Tfloat* 
out, unsigned flags); template <> inline typename fftw_trait<_Float16>::fftw_plan_type fftw_plan_guru64_c2r<_Float16>(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait<_Float16>::fftw_complex_type* in, _Float16* out, unsigned flags) { return fftwf_plan_guru64_dft_c2r( rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast(out), flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_c2r(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, float* out, unsigned flags) { return fftwf_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_c2r(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, double* out, unsigned flags) { return fftw_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); } // Template wrappers for FFTW c2r executors: template inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out); template <> inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { // since FFTW does not natively support half precision, convert // input to single, execute, then convert output back to half auto in_single = half_to_single_copy(in.front()); fftwf_execute_dft_c2r(plan, reinterpret_cast(in_single.data()), reinterpret_cast(out.front().data())); single_to_half_inplace(out.front()); } template <> inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftwf_execute_dft_c2r(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } template <> inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type 
plan, std::vector& in, std::vector& out) { fftw_execute_dft_c2r(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } #ifdef FFTW_HAVE_SPRINT_PLAN // Template wrappers for FFTW print plan: template inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan); template <> inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan) { return fftwf_sprint_plan(plan); } template <> inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) { return fftwf_sprint_plan(plan); } template <> inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) { return fftw_sprint_plan(plan); } #endif #endif upstream/shared/rocfft_complex.h0000664000175000017500000002322714637252753016033 0ustar kaolkaol// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ROCFFT_COMPLEX_H #define ROCFFT_COMPLEX_H #include #if !defined(__HIPCC_RTC__) #include #endif #include #include #ifdef __HIP_PLATFORM_NVIDIA__ typedef __half _Float16; #endif template struct rocfft_complex { Treal x; // Real part Treal y; // Imaginary part // Constructors // Do not initialize the members x or y by default, to ensure that it can // be used in __shared__ and that it is a trivial class compatible with C. __device__ __host__ rocfft_complex() = default; __device__ __host__ rocfft_complex(const rocfft_complex&) = default; __device__ __host__ rocfft_complex(rocfft_complex&&) = default; __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default; __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default; __device__ __host__ ~rocfft_complex() = default; // Constructor from real and imaginary parts __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag) : x{real} , y{imag} { } // Conversion from different precision template __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex& z) : x(z.x) , y(z.y) { } // Accessors __device__ __host__ constexpr Treal real() const { return x; } __device__ __host__ constexpr Treal imag() const { return y; } // Unary operations __forceinline__ __device__ __host__ rocfft_complex operator-() const { return {-x, -y}; } __forceinline__ __device__ __host__ rocfft_complex operator+() const { return *this; } __device__ __host__ Treal asum(const rocfft_complex& z) { return abs(z.x) + abs(z.y); } // Internal real functions static __forceinline__ __device__ __host__ Treal abs(Treal x) { return x < 0 ? 
-x : x; } static __forceinline__ __device__ __host__ float sqrt(float x) { return ::sqrtf(x); } static __forceinline__ __device__ __host__ double sqrt(double x) { return ::sqrt(x); } // Addition operators __device__ __host__ auto& operator+=(const rocfft_complex& rhs) { return *this = {x + rhs.x, y + rhs.y}; } __device__ __host__ auto operator+(const rocfft_complex& rhs) const { auto lhs = *this; return lhs += rhs; } // Subtraction operators __device__ __host__ auto& operator-=(const rocfft_complex& rhs) { return *this = {x - rhs.x, y - rhs.y}; } __device__ __host__ auto operator-(const rocfft_complex& rhs) const { auto lhs = *this; return lhs -= rhs; } // Multiplication operators __device__ __host__ auto& operator*=(const rocfft_complex& rhs) { return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y}; } __device__ __host__ auto operator*(const rocfft_complex& rhs) const { auto lhs = *this; return lhs *= rhs; } // Division operators __device__ __host__ auto& operator/=(const rocfft_complex& rhs) { // Form of Robert L. 
Smith's Algorithm 116 if(abs(rhs.x) > abs(rhs.y)) { Treal ratio = rhs.y / rhs.x; Treal scale = 1 / (rhs.x + rhs.y * ratio); *this = {(x + y * ratio) * scale, (y - x * ratio) * scale}; } else { Treal ratio = rhs.x / rhs.y; Treal scale = 1 / (rhs.x * ratio + rhs.y); *this = {(y + x * ratio) * scale, (y * ratio - x) * scale}; } return *this; } __device__ __host__ auto operator/(const rocfft_complex& rhs) const { auto lhs = *this; return lhs /= rhs; } // Comparison operators __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const { return x == rhs.x && y == rhs.y; } __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const { return !(*this == rhs); } // Operators for complex-real computations template __device__ __host__ auto& operator+=(const U& rhs) { return (x += Treal(rhs)), *this; } template __device__ __host__ auto& operator-=(const U& rhs) { return (x -= Treal(rhs)), *this; } __device__ __host__ auto operator+(const Treal& rhs) { auto lhs = *this; return lhs += rhs; } __device__ __host__ auto operator-(const Treal& rhs) { auto lhs = *this; return lhs -= rhs; } template __device__ __host__ auto& operator*=(const U& rhs) { return (x *= Treal(rhs)), (y *= Treal(rhs)), *this; } template __device__ __host__ auto operator*(const U& rhs) const { auto lhs = *this; return lhs *= Treal(rhs); } template __device__ __host__ auto& operator/=(const U& rhs) { return (x /= Treal(rhs)), (y /= Treal(rhs)), *this; } template __device__ __host__ auto operator/(const U& rhs) const { auto lhs = *this; return lhs /= Treal(rhs); } template __device__ __host__ constexpr bool operator==(const U& rhs) const { return x == Treal(rhs) && y == 0; } template __device__ __host__ constexpr bool operator!=(const U& rhs) const { return !(*this == rhs); } }; // Stream operators #if !defined(__HIPCC_RTC__) static std::ostream& operator<<(std::ostream& stream, const _Float16& f) { return stream << static_cast(f); } template std::ostream& 
operator<<(std::ostream& out, const rocfft_complex& z) { return out << '(' << static_cast(z.x) << ',' << static_cast(z.y) << ')'; } #endif // Operators for real-complex computations template __device__ __host__ rocfft_complex operator+(const U& lhs, const rocfft_complex& rhs) { return {Treal(lhs) + rhs.x, rhs.y}; } template __device__ __host__ rocfft_complex operator-(const U& lhs, const rocfft_complex& rhs) { return {Treal(lhs) - rhs.x, -rhs.y}; } template __device__ __host__ rocfft_complex operator*(const U& lhs, const rocfft_complex& rhs) { return {Treal(lhs) * rhs.x, Treal(lhs) * rhs.y}; } template __device__ __host__ rocfft_complex operator/(const U& lhs, const rocfft_complex& rhs) { // Form of Robert L. Smith's Algorithm 116 if(rocfft_complex::abs(rhs.x) > rocfft_complex::abs(rhs.y)) { Treal ratio = rhs.y / rhs.x; Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio); return {scale, -scale * ratio}; } else { Treal ratio = rhs.x / rhs.y; Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y); return {ratio * scale, -scale}; } } template __device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex& rhs) { return Treal(lhs) == rhs.x && 0 == rhs.y; } template __device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex& rhs) { return !(lhs == rhs); } // Extending std namespace to handle rocfft_complex datatype namespace std { template __device__ __host__ constexpr Treal real(const rocfft_complex& z) { return z.x; } template __device__ __host__ constexpr Treal imag(const rocfft_complex& z) { return z.y; } template __device__ __host__ constexpr rocfft_complex conj(const rocfft_complex& z) { return {z.x, -z.y}; } template __device__ __host__ inline Treal norm(const rocfft_complex& z) { return (z.x * z.x) + (z.y * z.y); } template __device__ __host__ inline Treal abs(const rocfft_complex& z) { Treal tr = rocfft_complex::abs(z.x), ti = rocfft_complex::abs(z.y); return tr > ti ? (ti /= tr, tr * rocfft_complex::sqrt(ti * ti + 1)) : ti ? 
(tr /= ti, ti * rocfft_complex::sqrt(tr * tr + 1)) : 0; } } #endif // ROCFFT_COMPLEX_H upstream/shared/hostbuf.h0000664000175000017500000001032314637253000014446 0ustar kaolkaol// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCFFT_HOSTBUF_H #define ROCFFT_HOSTBUF_H #include "arithmetic.h" #include #include #ifndef WIN32 #include #include #endif // Simple RAII class for host buffers. 
T is the type of pointer that // data() returns template class hostbuf_t { public: hostbuf_t() {} // buffers are movable but not copyable hostbuf_t(hostbuf_t&& other) { std::swap(buf, other.buf); std::swap(bsize, other.bsize); } hostbuf_t& operator=(hostbuf_t&& other) { std::swap(buf, other.buf); std::swap(bsize, other.bsize); return *this; } hostbuf_t(const hostbuf_t&) = delete; hostbuf_t& operator=(const hostbuf_t&) = delete; ~hostbuf_t() { free(); } void alloc(size_t size) { bsize = size; free(); // we're aligning to multiples of 64 bytes, so round the // allocation size up to the nearest 64 to keep ASAN happy if(size % 64) { size += 64 - size % 64; } // FFTW requires aligned allocations to use faster SIMD instructions. // If enabling hugepages, align to 2 MiB. Otherwise, aligning to // 64 bytes is enough for AVX instructions up to AVX512. #ifdef WIN32 buf = _aligned_malloc(size, 64); #else // On Linux, ask for hugepages to reduce TLB pressure and // improve performance. Allocations need to be aligned to // the hugepage size, and rounded up to the next whole // hugepage. 
static const size_t TWO_MiB = 2 * 1024 * 1024; if(size >= TWO_MiB) { size_t rounded_size = DivRoundingUp(size, TWO_MiB) * TWO_MiB; buf = aligned_alloc(TWO_MiB, rounded_size); madvise(buf, rounded_size, MADV_HUGEPAGE); } else buf = aligned_alloc(64, size); #endif } size_t size() const { return bsize; } void free() { if(buf != nullptr) { #ifdef WIN32 _aligned_free(buf); #else std::free(buf); #endif buf = nullptr; bsize = 0; } } T* data() const { return static_cast(buf); } // Copy method hostbuf_t copy() const { hostbuf_t copy; copy.alloc(bsize); memcpy(copy.buf, buf, bsize); return copy; } // shrink the buffer to fit the new size void shrink(size_t new_size) { if(new_size > bsize) throw std::runtime_error("can't shrink hostbuf to larger size"); // just pretend the buffer is now that size bsize = new_size; } // equality/bool tests bool operator==(std::nullptr_t n) const { return buf == n; } bool operator!=(std::nullptr_t n) const { return buf != n; } operator bool() const { return buf; } private: // The host buffer void* buf = nullptr; size_t bsize = 0; }; // default hostbuf that gives out void* pointers typedef hostbuf_t<> hostbuf; #endif upstream/shared/fft_params.h0000664000175000017500000035723214637253000015133 0ustar kaolkaol// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef FFT_PARAMS_H #define FFT_PARAMS_H #include #include #include #include #include #include #ifdef _OPENMP #include #endif #include #include #include #include #include "../shared/arithmetic.h" #include "../shared/array_validator.h" #include "../shared/data_gen_device.h" #include "../shared/data_gen_host.h" #include "../shared/device_properties.h" #include "../shared/printbuffer.h" #include "../shared/ptrdiff.h" enum fft_status { fft_status_success, fft_status_failure, fft_status_invalid_arg_value, fft_status_invalid_dimensions, fft_status_invalid_array_type, fft_status_invalid_strides, fft_status_invalid_distance, fft_status_invalid_offset, fft_status_invalid_work_buffer, }; enum fft_transform_type { fft_transform_type_complex_forward, fft_transform_type_complex_inverse, fft_transform_type_real_forward, fft_transform_type_real_inverse, }; enum fft_precision { fft_precision_half, fft_precision_single, fft_precision_double, }; static std::istream& operator>>(std::istream& str, fft_precision& precision) { std::string word; str >> word; if(word == "half") precision = fft_precision_half; else if(word == "single") precision = fft_precision_single; else if(word == "double") precision = fft_precision_double; else throw std::runtime_error("Invalid precision specified"); return str; } // fft_input_generator: linearly spaced sequence in [-0.5,0.5] // fft_input_random_generator: pseudo-random sequence in [-0.5,0.5] enum fft_input_generator { fft_input_random_generator_device, 
fft_input_random_generator_host, fft_input_generator_device, fft_input_generator_host, }; static std::istream& operator>>(std::istream& str, fft_input_generator& gen) { std::string word; str >> word; if(word == "0") gen = fft_input_random_generator_device; else if(word == "1") gen = fft_input_random_generator_host; else if(word == "2") gen = fft_input_generator_device; else if(word == "3") gen = fft_input_generator_host; else throw std::runtime_error("Invalid input generator specified"); return str; } enum fft_array_type { fft_array_type_complex_interleaved, fft_array_type_complex_planar, fft_array_type_real, fft_array_type_hermitian_interleaved, fft_array_type_hermitian_planar, fft_array_type_unset, }; enum fft_result_placement { fft_placement_inplace, fft_placement_notinplace, }; // Determine the size of the data type given the precision and type. template inline Tsize var_size(const fft_precision precision, const fft_array_type type) { size_t var_size = 0; switch(precision) { case fft_precision_half: var_size = sizeof(_Float16); break; case fft_precision_single: var_size = sizeof(float); break; case fft_precision_double: var_size = sizeof(double); break; } switch(type) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: var_size *= 2; break; default: break; } return var_size; } // Given an array type and transform length, strides, etc, load random floats in [0,1] // into the input array of floats/doubles or complex floats/doubles gpu buffers. 
template inline void set_input(std::vector& input, const fft_input_generator igen, const fft_array_type itype, const std::vector& length, const std::vector& ilength, const std::vector& istride, const Tint1& whole_length, const Tint1& whole_stride, const size_t idist, const size_t nbatch, const hipDeviceProp_t& deviceProp) { auto isize = count_iters(whole_length) * nbatch; switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { auto ibuffer = (rocfft_complex*)input[0].data(); if(igen == fft_input_generator_device) generate_interleaved_data( whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp); else if(igen == fft_input_random_generator_device) generate_random_interleaved_data( whole_length, idist, isize, whole_stride, ibuffer, deviceProp); if(itype == fft_array_type_hermitian_interleaved) { auto ibuffer_2 = (rocfft_complex*)input[0].data(); impose_hermitian_symmetry_interleaved( length, ilength, istride, idist, nbatch, ibuffer_2, deviceProp); } break; } case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: { auto ibuffer_real = (Tfloat*)input[0].data(); auto ibuffer_imag = (Tfloat*)input[1].data(); if(igen == fft_input_generator_device) generate_planar_data(whole_length, idist, isize, whole_stride, nbatch, ibuffer_real, ibuffer_imag, deviceProp); else if(igen == fft_input_random_generator_device) generate_random_planar_data( whole_length, idist, isize, whole_stride, ibuffer_real, ibuffer_imag, deviceProp); if(itype == fft_array_type_hermitian_planar) impose_hermitian_symmetry_planar( length, ilength, istride, idist, nbatch, ibuffer_real, ibuffer_imag, deviceProp); break; } case fft_array_type_real: { auto ibuffer = (Tfloat*)input[0].data(); if(igen == fft_input_generator_device) generate_real_data( whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp); else if(igen == fft_input_random_generator_device) generate_random_real_data( whole_length, idist, isize, 
whole_stride, ibuffer, deviceProp); break; } default: throw std::runtime_error("Input layout format not yet supported"); } } template inline void set_input(std::vector& input, const fft_input_generator igen, const fft_array_type itype, const std::vector& length, const std::vector& ilength, const std::vector& istride, const Tint1& whole_length, const Tint1& whole_stride, const size_t idist, const size_t nbatch, const hipDeviceProp_t& deviceProp) { switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { if(igen == fft_input_generator_host) generate_interleaved_data(input, whole_length, whole_stride, idist, nbatch); else if(igen == fft_input_random_generator_host) generate_random_interleaved_data( input, whole_length, whole_stride, idist, nbatch); if(itype == fft_array_type_hermitian_interleaved) impose_hermitian_symmetry_interleaved(input, length, istride, idist, nbatch); break; } case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: { if(igen == fft_input_generator_host) generate_planar_data(input, whole_length, whole_stride, idist, nbatch); else if(igen == fft_input_random_generator_host) generate_random_planar_data(input, whole_length, whole_stride, idist, nbatch); if(itype == fft_array_type_hermitian_planar) impose_hermitian_symmetry_planar(input, length, istride, idist, nbatch); break; } case fft_array_type_real: { if(igen == fft_input_generator_host) generate_real_data(input, whole_length, whole_stride, idist, nbatch); else if(igen == fft_input_random_generator_host) generate_random_real_data(input, whole_length, whole_stride, idist, nbatch); break; } default: throw std::runtime_error("Input layout format not yet supported"); } } // unroll set_input for dimension 1, 2, 3 template inline void set_input(std::vector& input, const fft_input_generator igen, const fft_array_type itype, const std::vector& length, const std::vector& ilength, const std::vector& istride, const size_t idist, const size_t 
nbatch, const hipDeviceProp_t& deviceProp) { switch(length.size()) { case 1: set_input(input, igen, itype, length, ilength, istride, ilength[0], istride[0], idist, nbatch, deviceProp); break; case 2: set_input(input, igen, itype, length, ilength, istride, std::make_tuple(ilength[0], ilength[1]), std::make_tuple(istride[0], istride[1]), idist, nbatch, deviceProp); break; case 3: set_input(input, igen, itype, length, ilength, istride, std::make_tuple(ilength[0], ilength[1], ilength[2]), std::make_tuple(istride[0], istride[1], istride[2]), idist, nbatch, deviceProp); break; default: abort(); } } // Container class for test parameters. class fft_params { public: // All parameters are row-major. std::vector length; std::vector istride; std::vector ostride; size_t nbatch = 1; fft_precision precision = fft_precision_single; fft_input_generator igen = fft_input_random_generator_device; fft_transform_type transform_type = fft_transform_type_complex_forward; fft_result_placement placement = fft_placement_inplace; size_t idist = 0; size_t odist = 0; fft_array_type itype = fft_array_type_unset; fft_array_type otype = fft_array_type_unset; std::vector ioffset = {0, 0}; std::vector ooffset = {0, 0}; std::vector isize; std::vector osize; size_t workbuffersize = 0; struct fft_brick { // all vectors here are row-major, with same length as FFT // dimension + 1 (for batch dimension) // inclusive lower bound of brick std::vector lower; // exclusive upper bound of brick std::vector upper; // stride of brick in memory std::vector stride; // compute the length of this brick std::vector length() const { std::vector ret; for(size_t i = 0; i < lower.size(); ++i) ret.push_back(upper[i] - lower[i]); return ret; } // compute offset of lower bound in a field with the given // stride + dist (batch stride is separate) size_t lower_field_offset(std::vector stride, size_t dist) const { // brick strides include batch, so adjust our input accordingly stride.insert(stride.begin(), dist); return 
std::inner_product(lower.begin(), lower.end(), stride.begin(), 0); } // location of the brick int device = 0; }; struct fft_field { std::vector bricks; }; // optional brick decomposition of inputs/outputs std::vector ifields; std::vector ofields; // run testing load/store callbacks bool run_callbacks = false; static constexpr double load_cb_scalar = 0.457813941; static constexpr double store_cb_scalar = 0.391504938; // Check that data outside of output strides is not overwritten. // This is only set explicitly on some tests where there's space // between dimensions, but the dimensions are still in-order. // We're not trying to generically find holes in arbitrary data // layouts. // // NOTE: this flag is not included in tokens, since it doesn't // affect how the FFT library behaves. bool check_output_strides = false; // scaling factor - we do a pointwise multiplication of outputs by // this factor double scale_factor = 1.0; fft_params(){}; virtual ~fft_params(){}; // Given an array type, return the name as a string. static std::string array_type_name(const fft_array_type type, bool verbose = true) { switch(type) { case fft_array_type_complex_interleaved: return verbose ? "fft_array_type_complex_interleaved" : "CI"; case fft_array_type_complex_planar: return verbose ? "fft_array_type_complex_planar" : "CP"; case fft_array_type_real: return verbose ? "fft_array_type_real" : "R"; case fft_array_type_hermitian_interleaved: return verbose ? "fft_array_type_hermitian_interleaved" : "HI"; case fft_array_type_hermitian_planar: return verbose ? "fft_array_type_hermitian_planar" : "HP"; case fft_array_type_unset: return verbose ? 
"fft_array_type_unset" : "UN"; } return ""; } std::string transform_type_name() const { switch(transform_type) { case fft_transform_type_complex_forward: return "fft_transform_type_complex_forward"; case fft_transform_type_complex_inverse: return "fft_transform_type_complex_inverse"; case fft_transform_type_real_forward: return "fft_transform_type_real_forward"; case fft_transform_type_real_inverse: return "fft_transform_type_real_inverse"; default: throw std::runtime_error("Invalid transform type"); } } // Convert to string for output. std::string str(const std::string& separator = ", ") const { // top-level stride/dist are not used when fields are specified. const bool have_ifields = !ifields.empty(); const bool have_ofields = !ofields.empty(); std::stringstream ss; auto print_size_vec = [&](const char* description, const std::vector& vec) { ss << description << ":"; for(auto i : vec) ss << " " << i; ss << separator; }; auto print_fields = [&](const char* description, const std::vector& fields) { for(unsigned int fidx = 0; fidx < fields.size(); ++fidx) { const auto& f = fields[fidx]; ss << description << " " << fidx << ":" << separator; for(unsigned int bidx = 0; bidx < f.bricks.size(); ++bidx) { const auto& b = f.bricks[bidx]; ss << " brick " << bidx << ":" << separator; print_size_vec(" lower", b.lower); print_size_vec(" upper", b.upper); print_size_vec(" stride", b.stride); ss << " device: " << b.device << separator; } } }; print_size_vec("length", length); if(have_ifields) { print_fields("ifield", ifields); } else { print_size_vec("istride", istride); ss << "idist: " << idist << separator; } if(have_ofields) { print_fields("ofield", ofields); } else { print_size_vec("ostride", ostride); ss << "odist: " << odist << separator; } ss << "batch: " << nbatch << separator; print_size_vec("isize", isize); print_size_vec("osize", osize); print_size_vec("ioffset", ioffset); print_size_vec("ooffset", ooffset); if(placement == fft_placement_inplace) ss << "in-place"; 
else ss << "out-of-place"; ss << separator; ss << "transform_type: " << transform_type_name() << separator; ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator; switch(precision) { case fft_precision_half: ss << "half-precision"; break; case fft_precision_single: ss << "single-precision"; break; case fft_precision_double: ss << "double-precision"; break; } ss << separator; print_size_vec("ilength", ilength()); print_size_vec("olength", olength()); print_size_vec("ibuffer_size", ibuffer_sizes()); print_size_vec("obuffer_size", obuffer_sizes()); if(scale_factor != 1.0) ss << "scale factor: " << scale_factor << separator; return ss.str(); } // Produce a stringified token of the test fft params. std::string token() const { std::string ret; switch(transform_type) { case fft_transform_type_complex_forward: ret += "complex_forward_"; break; case fft_transform_type_complex_inverse: ret += "complex_inverse_"; break; case fft_transform_type_real_forward: ret += "real_forward_"; break; case fft_transform_type_real_inverse: ret += "real_inverse_"; break; } auto append_size_vec = [&ret](const std::vector& vec) { for(auto s : vec) { ret += "_"; ret += std::to_string(s); } }; ret += "len"; append_size_vec(length); switch(precision) { case fft_precision_half: ret += "_half_"; break; case fft_precision_single: ret += "_single_"; break; case fft_precision_double: ret += "_double_"; break; } switch(placement) { case fft_placement_inplace: ret += "ip_"; break; case fft_placement_notinplace: ret += "op_"; break; } ret += "batch_"; ret += std::to_string(nbatch); auto append_array_type = [&ret](fft_array_type type) { switch(type) { case fft_array_type_complex_interleaved: ret += "CI"; break; case fft_array_type_complex_planar: ret += "CP"; break; case fft_array_type_real: ret += "R"; break; case fft_array_type_hermitian_interleaved: ret += "HI"; break; case fft_array_type_hermitian_planar: ret += "HP"; break; default: ret += "UN"; break; } }; auto 
append_brick_info = [&ret, &append_size_vec](const fft_brick& b) { ret += "_brick"; ret += "_lower"; append_size_vec(b.lower); ret += "_upper"; append_size_vec(b.upper); ret += "_stride"; append_size_vec(b.stride); ret += "_dev_"; ret += std::to_string(b.device); }; const bool have_ifields = !ifields.empty(); const bool have_ofields = !ofields.empty(); if(have_ifields) { for(const auto& f : ifields) { ret += "_ifield"; for(const auto& b : f.bricks) append_brick_info(b); } } else { ret += "_istride"; append_size_vec(istride); ret += "_"; append_array_type(itype); } if(have_ofields) { for(const auto& f : ofields) { ret += "_ofield"; for(const auto& b : f.bricks) append_brick_info(b); } } else { ret += "_ostride"; append_size_vec(ostride); ret += "_"; append_array_type(otype); } if(!have_ifields) { ret += "_idist_"; ret += std::to_string(idist); } if(!have_ofields) { ret += "_odist_"; ret += std::to_string(odist); } if(!have_ifields) { ret += "_ioffset"; append_size_vec(ioffset); } if(!have_ofields) { ret += "_ooffset"; append_size_vec(ooffset); } if(run_callbacks) ret += "_CB"; if(scale_factor != 1.0) ret += "_scale"; return ret; } // Set all params from a stringified token. 
void from_token(std::string token) { std::vector vals; std::string delimiter = "_"; { size_t pos = 0; while((pos = token.find(delimiter)) != std::string::npos) { auto val = token.substr(0, pos); vals.push_back(val); token.erase(0, pos + delimiter.length()); } vals.push_back(token); } auto size_parser = [](const std::vector& vals, const std::string token, size_t& pos) { if(vals[pos++] != token) throw std::runtime_error("Unable to parse token"); return std::stoull(vals[pos++]); }; auto vector_parser = [](const std::vector& vals, const std::string token, size_t& pos) { if(vals[pos++] != token) throw std::runtime_error("Unable to parse token"); std::vector vec; while(pos < vals.size()) { if(std::all_of(vals[pos].begin(), vals[pos].end(), ::isdigit)) { vec.push_back(std::stoull(vals[pos++])); } else { break; } } return vec; }; auto type_parser = [](const std::string& val) { if(val == "CI") return fft_array_type_complex_interleaved; else if(val == "CP") return fft_array_type_complex_planar; else if(val == "R") return fft_array_type_real; else if(val == "HI") return fft_array_type_hermitian_interleaved; else if(val == "HP") return fft_array_type_hermitian_planar; return fft_array_type_unset; }; auto field_parser = [&vector_parser, &size_parser](const std::vector& vals, size_t& pos, std::vector& output) { // skip over ifield/ofield word pos++; fft_field& f = output.emplace_back(); while(pos < vals.size() && vals[pos] == "brick") { fft_brick& b = f.bricks.emplace_back(); pos++; b.lower = vector_parser(vals, "lower", pos); b.upper = vector_parser(vals, "upper", pos); b.stride = vector_parser(vals, "stride", pos); b.device = size_parser(vals, "dev", pos); } }; size_t pos = 0; bool complex = vals[pos++] == "complex"; bool forward = vals[pos++] == "forward"; if(complex && forward) transform_type = fft_transform_type_complex_forward; if(complex && !forward) transform_type = fft_transform_type_complex_inverse; if(!complex && forward) transform_type = 
fft_transform_type_real_forward; if(!complex && !forward) transform_type = fft_transform_type_real_inverse; length = vector_parser(vals, "len", pos); if(vals[pos] == "half") precision = fft_precision_half; else if(vals[pos] == "single") precision = fft_precision_single; else if(vals[pos] == "double") precision = fft_precision_double; pos++; placement = (vals[pos++] == "ip") ? fft_placement_inplace : fft_placement_notinplace; nbatch = size_parser(vals, "batch", pos); // strides, bricks etc are mixed in from here, so just keep // looking at the next token to decide what to do while(pos < vals.size()) { const auto& next_token = vals[pos]; if(next_token == "istride") { istride = vector_parser(vals, "istride", pos); itype = type_parser(vals[pos]); pos++; } else if(next_token == "ostride") { ostride = vector_parser(vals, "ostride", pos); otype = type_parser(vals[pos]); pos++; } else if(next_token == "idist") idist = size_parser(vals, "idist", pos); else if(next_token == "odist") odist = size_parser(vals, "odist", pos); else if(next_token == "ioffset") ioffset = vector_parser(vals, "ioffset", pos); else if(next_token == "ooffset") ooffset = vector_parser(vals, "ooffset", pos); else if(next_token == "ifield") field_parser(vals, pos, ifields); else if(next_token == "ofield") field_parser(vals, pos, ofields); else break; } if(pos < vals.size() && vals[pos] == "CB") { run_callbacks = true; ++pos; } if(pos < vals.size() && vals[pos] == "scale") { // just pick some factor that's not zero or one scale_factor = 0.1239; ++pos; } } // Stream output operator (for gtest, etc). friend std::ostream& operator<<(std::ostream& stream, const fft_params& params) { stream << params.str(); return stream; } // Dimension of the transform. 
// Dimension of the transform (row-major; batch is not counted).
size_t dim() const
{
    return length.size();
}

// Logical length of the *input* data.  Inverse real transforms take
// Hermitian input, so the fastest dimension holds N/2 + 1 values.
virtual std::vector ilength() const
{
    auto ilength = length;
    if(transform_type == fft_transform_type_real_inverse)
        ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1;
    return ilength;
}

// Logical length of the *output* data.  Forward real transforms produce
// Hermitian output, so the fastest dimension holds N/2 + 1 values.
virtual std::vector olength() const
{
    auto olength = length;
    if(transform_type == fft_transform_type_real_forward)
        olength[dim() - 1] = olength[dim() - 1] / 2 + 1;
    return olength;
}

// Number of raw buffers needed for the given array type: planar formats
// use two (real + imaginary), interleaved/real use one, unset uses none.
static size_t nbuffer(const fft_array_type type)
{
    switch(type)
    {
    case fft_array_type_real:
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
        return 1;
    case fft_array_type_complex_planar:
    case fft_array_type_hermitian_planar:
        return 2;
    case fft_array_type_unset:
        return 0;
    }
    return 0;
}

// Number of input buffers
size_t nibuffer() const
{
    return nbuffer(itype);
}

// Number of output buffers
size_t nobuffer() const
{
    return nbuffer(otype);
}

// Fill in unset input/output array types with the natural defaults for
// the transform type; throws on an invalid transform type.
void set_iotypes()
{
    if(itype == fft_array_type_unset)
    {
        switch(transform_type)
        {
        case fft_transform_type_complex_forward:
        case fft_transform_type_complex_inverse:
            itype = fft_array_type_complex_interleaved;
            break;
        case fft_transform_type_real_forward:
            itype = fft_array_type_real;
            break;
        case fft_transform_type_real_inverse:
            itype = fft_array_type_hermitian_interleaved;
            break;
        default:
            throw std::runtime_error("Invalid transform type");
        }
    }
    if(otype == fft_array_type_unset)
    {
        switch(transform_type)
        {
        case fft_transform_type_complex_forward:
        case fft_transform_type_complex_inverse:
            otype = fft_array_type_complex_interleaved;
            break;
        case fft_transform_type_real_forward:
            otype = fft_array_type_hermitian_interleaved;
            break;
        case fft_transform_type_real_inverse:
            otype = fft_array_type_real;
            break;
        default:
            throw std::runtime_error("Invalid transform type");
        }
    }
}
// Check that the input and output types are consistent.
bool check_iotypes() const { switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_complex_planar: case fft_array_type_hermitian_interleaved: case fft_array_type_hermitian_planar: case fft_array_type_real: break; default: throw std::runtime_error("Invalid Input array type format"); } switch(otype) { case fft_array_type_complex_interleaved: case fft_array_type_complex_planar: case fft_array_type_hermitian_interleaved: case fft_array_type_hermitian_planar: case fft_array_type_real: break; default: throw std::runtime_error("Invalid Input array type format"); } // Check that format choices are supported if(transform_type != fft_transform_type_real_forward && transform_type != fft_transform_type_real_inverse) { if(placement == fft_placement_inplace && itype != otype) { throw std::runtime_error( "In-place transforms must have identical input and output types"); } } bool okformat = true; switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_complex_planar: okformat = (otype == fft_array_type_complex_interleaved || otype == fft_array_type_complex_planar); break; case fft_array_type_hermitian_interleaved: case fft_array_type_hermitian_planar: okformat = otype == fft_array_type_real; break; case fft_array_type_real: okformat = (otype == fft_array_type_hermitian_interleaved || otype == fft_array_type_hermitian_planar); break; default: throw std::runtime_error("Invalid Input array type format"); } return okformat; } // Given a length vector, set the rest of the strides. // The optional argument stride0 sets the stride for the contiguous dimension. // The optional rcpadding argument sets the stride correctly for in-place // multi-dimensional real/complex transforms. // Format is row-major. 
template std::vector compute_stride(const std::vector& length,
                                    const std::vector& stride0 = std::vector(),
                                    const bool rcpadding = false) const
{
    std::vector stride(dim());

    size_t dimoffset = 0;

    if(stride0.size() == 0)
    {
        // Set the contiguous stride:
        stride[dim() - 1] = 1;
        dimoffset         = 1;
    }
    else
    {
        // Copy the input values to the end of the stride array:
        for(size_t i = 0; i < stride0.size(); ++i)
        {
            stride[dim() - stride0.size() + i] = stride0[i];
        }
    }

    if(stride0.size() < dim())
    {
        // Compute any remaining values, working backwards from the
        // fastest-varying dimension.
        for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;)
        {
            auto lengthip1 = length[i + 1];
            if(rcpadding && i == dim() - 2)
            {
                // In-place real/complex: pad the fastest dimension to
                // hold 2 * (N/2 + 1) real values.
                lengthip1 = 2 * (lengthip1 / 2 + 1);
            }
            stride[i] = stride[i + 1] * lengthip1;
        }
    }

    return stride;
}

// Set istride from ilength(); pads for in-place real-forward transforms.
void compute_istride()
{
    istride = compute_stride(ilength(),
                             istride,
                             placement == fft_placement_inplace
                                 && transform_type == fft_transform_type_real_forward);
}

// Set ostride from olength(); pads for in-place real-inverse transforms.
void compute_ostride()
{
    ostride = compute_stride(olength(),
                             ostride,
                             placement == fft_placement_inplace
                                 && transform_type == fft_transform_type_real_inverse);
}

// Compute per-buffer input sizes (in elements), including offsets.
virtual void compute_isize()
{
    auto   il  = ilength();
    size_t val = compute_ptrdiff(il, istride, nbatch, idist);
    isize.resize(nibuffer());
    for(unsigned int i = 0; i < isize.size(); ++i)
    {
        isize[i] = val + ioffset[i];
    }
}

// Compute per-buffer output sizes (in elements), including offsets.
virtual void compute_osize()
{
    auto   ol  = olength();
    size_t val = compute_ptrdiff(ol, ostride, nbatch, odist);
    osize.resize(nobuffer());
    for(unsigned int i = 0; i < osize.size(); ++i)
    {
        osize[i] = val + ooffset[i];
    }
}

// Input buffer sizes in bytes, one entry per buffer.
std::vector ibuffer_sizes() const
{
    std::vector ibuffer_sizes;

    // In-place real-to-complex transforms need to have enough space in the input buffer to
    // accommodate the output, which is slightly larger.
if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward) { return obuffer_sizes(); } if(isize.empty()) return ibuffer_sizes; switch(itype) { case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: ibuffer_sizes.resize(2); break; default: ibuffer_sizes.resize(1); } for(unsigned i = 0; i < ibuffer_sizes.size(); i++) { ibuffer_sizes[i] = isize[i] * var_size(precision, itype); } return ibuffer_sizes; } virtual std::vector obuffer_sizes() const { std::vector obuffer_sizes; if(osize.empty()) return obuffer_sizes; switch(otype) { case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: obuffer_sizes.resize(2); break; default: obuffer_sizes.resize(1); } for(unsigned i = 0; i < obuffer_sizes.size(); i++) { obuffer_sizes[i] = osize[i] * var_size(precision, otype); } return obuffer_sizes; } // Compute the idist for a given transform based on the placeness, transform type, and data // layout. size_t compute_idist() const { size_t dist = 0; // In-place 1D transforms need extra dist. if(transform_type == fft_transform_type_real_forward && dim() == 1 && placement == fft_placement_inplace) { dist = 2 * (length[0] / 2 + 1) * istride[0]; return dist; } if(transform_type == fft_transform_type_real_inverse && dim() == 1) { dist = (length[0] / 2 + 1) * istride[0]; return dist; } dist = (transform_type == fft_transform_type_real_inverse) ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1] : length[dim() - 1] * istride[dim() - 1]; for(unsigned int i = 0; i < dim() - 1; ++i) { dist = std::max(length[i] * istride[i], dist); } return dist; } void set_idist() { if(idist != 0) return; idist = compute_idist(); } // Compute the odist for a given transform based on the placeness, transform type, and data // layout. Row-major. size_t compute_odist() const { size_t dist = 0; // In-place 1D transforms need extra dist. 
if(transform_type == fft_transform_type_real_inverse && dim() == 1
   && placement == fft_placement_inplace)
{
    // In-place 1D c2r output is padded to 2 * (N/2 + 1) reals.
    dist = 2 * (length[0] / 2 + 1) * ostride[0];
    return dist;
}
if(transform_type == fft_transform_type_real_forward && dim() == 1)
{
    // 1D r2c output is Hermitian: N/2 + 1 complex values.
    dist = (length[0] / 2 + 1) * ostride[0];
    return dist;
}
dist = (transform_type == fft_transform_type_real_forward)
           ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1]
           : length[dim() - 1] * ostride[dim() - 1];
for(unsigned int i = 0; i < dim() - 1; ++i)
{
    dist = std::max(length[i] * ostride[i], dist);
}
return dist;
}

// Set odist if the user didn't explicitly provide one.
void set_odist()
{
    if(odist != 0)
        return;
    odist = compute_odist();
}

// Put the length, stride, batch, and dist into a single length/stride array and pass off to the
// validity checker.
bool valid_length_stride_batch_dist(const std::vector& l0,
                                    const std::vector& s0,
                                    const size_t n,
                                    const size_t dist,
                                    const int verbose = 0) const
{
    if(l0.size() != s0.size())
        return false;

    // Length and stride vectors, including batches:
    std::vector l{}, s{};
    for(unsigned int i = 0; i < l0.size(); ++i)
    {
        // Length-1 dimensions are not traversed and are ignored; any
        // traversed dimension must have a nonzero stride.
        if(l0[i] > 1)
        {
            if(s0[i] == 0)
                return false;
            l.push_back(l0[i]);
            s.push_back(s0[i]);
        }
    }
    if(n > 1)
    {
        // Batch counts as one more dimension with stride 'dist'.
        if(dist == 0)
            return false;
        l.push_back(n);
        s.push_back(dist);
    }

    return array_valid(l, s, verbose);
}

// Return true if the given GPU parameters would produce a valid transform.
bool valid(const int verbose) const
{
    if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer())
        return false;

    // Check that in-place transforms have the same input and output stride:
    if(placement == fft_placement_inplace)
    {
        const auto stridesize = std::min(istride.size(), ostride.size());
        bool       samestride = true;
        for(unsigned int i = 0; i < stridesize; ++i)
        {
            if(istride[i] != ostride[i])
                samestride = false;
        }
        if((transform_type == fft_transform_type_complex_forward
            || transform_type == fft_transform_type_complex_inverse)
           && !samestride)
        {
            // In-place transforms require identical input and output strides.
if(verbose) { std::cout << "istride:"; for(const auto& i : istride) std::cout << " " << i; std::cout << " ostride0:"; for(const auto& i : ostride) std::cout << " " << i; std::cout << " differ; skipped for in-place transforms: skipping test" << std::endl; } return false; } if((transform_type == fft_transform_type_complex_forward || transform_type == fft_transform_type_complex_inverse) && (idist != odist) && nbatch > 1) { // In-place transforms require identical distance, if // batch > 1. If batch is 1 then dist is ignored and // the FFT should still work. if(verbose) { std::cout << "idist:" << idist << " odist:" << odist << " differ; skipped for in-place transforms: skipping test" << std::endl; } return false; } if((transform_type == fft_transform_type_real_forward || transform_type == fft_transform_type_real_inverse) && (istride.back() != 1 || ostride.back() != 1)) { // In-place real/complex transforms require unit strides. if(verbose) { std::cout << "istride.back(): " << istride.back() << " ostride.back(): " << ostride.back() << " must be unitary for in-place real/complex transforms: skipping test" << std::endl; } return false; } if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar) || (itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)) { if(verbose) { std::cout << "In-place c2c transforms require identical io types; skipped.\n"; } return false; } // Check offsets switch(transform_type) { case fft_transform_type_complex_forward: case fft_transform_type_complex_inverse: for(unsigned int i = 0; i < nibuffer(); ++i) { if(ioffset[i] != ooffset[i]) return false; } break; case fft_transform_type_real_forward: if(ioffset[0] != 2 * ooffset[0]) return false; break; case fft_transform_type_real_inverse: if(2 * ioffset[0] != ooffset[0]) return false; break; } } if(!check_iotypes()) return false; // we can only check output strides on out-of-place // transforms, since we need to initialize output to a 
known // pattern if(placement == fft_placement_inplace && check_output_strides) return false; // Check input and output strides if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true) { if(verbose) std::cout << "Invalid input data format.\n"; return false; } if(!(ilength() == olength() && istride == ostride && idist == odist)) { // Only check if different if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true) { if(verbose) std::cout << "Invalid output data format.\n"; return false; } } // The parameters are valid. return true; } // Fill in any missing parameters. void validate() { set_iotypes(); compute_istride(); compute_ostride(); set_idist(); set_odist(); compute_isize(); compute_osize(); validate_fields(); } virtual void validate_fields() const { if(!ifields.empty() || !ofields.empty()) throw std::runtime_error("input/output fields are unsupported"); } // Column-major getters: std::vector length_cm() const { auto length_cm = length; std::reverse(std::begin(length_cm), std::end(length_cm)); return length_cm; } std::vector ilength_cm() const { auto ilength_cm = ilength(); std::reverse(std::begin(ilength_cm), std::end(ilength_cm)); return ilength_cm; } std::vector olength_cm() const { auto olength_cm = olength(); std::reverse(std::begin(olength_cm), std::end(olength_cm)); return olength_cm; } std::vector istride_cm() const { auto istride_cm = istride; std::reverse(std::begin(istride_cm), std::end(istride_cm)); return istride_cm; } std::vector ostride_cm() const { auto ostride_cm = ostride; std::reverse(std::begin(ostride_cm), std::end(ostride_cm)); return ostride_cm; } bool is_planar() const { if(itype == fft_array_type_complex_planar || itype == fft_array_type_hermitian_planar) return true; if(otype == fft_array_type_complex_planar || otype == fft_array_type_hermitian_planar) return true; return false; } // Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if necessary. 
template inline void compute_input(std::vector& input) { auto deviceProp = get_curr_device_prop(); switch(precision) { case fft_precision_half: set_input( input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); break; case fft_precision_double: set_input( input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); break; case fft_precision_single: set_input( input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); break; } } template void print_ibuffer(const std::vector& buf, Tstream& stream = std::cout) const { switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(precision) { case fft_precision_half: { buffer_printer> s; s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); break; } case fft_precision_single: { buffer_printer> s; s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); break; } case fft_precision_double: { buffer_printer> s; s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); break; } } break; } case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: case fft_array_type_real: { switch(precision) { case fft_precision_half: { buffer_printer<_Float16> s; s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); break; } case fft_precision_single: { buffer_printer s; s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); break; } case fft_precision_double: { buffer_printer s; s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); break; } } break; } default: throw std::runtime_error("Invalid itype in print_ibuffer"); } } template void print_obuffer(const std::vector& buf, Tstream& stream = std::cout) const { switch(otype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(precision) { case fft_precision_half: { buffer_printer> s; s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); break; } case 
fft_precision_single: { buffer_printer> s; s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); break; } case fft_precision_double: buffer_printer> s; s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); break; } break; } case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: case fft_array_type_real: { switch(precision) { case fft_precision_half: { buffer_printer<_Float16> s; s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); break; } case fft_precision_single: { buffer_printer s; s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); break; } case fft_precision_double: { buffer_printer s; s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); break; } } break; } default: throw std::runtime_error("Invalid itype in print_obuffer"); } } void print_ibuffer_flat(const std::vector& buf) const { switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(precision) { case fft_precision_half: { buffer_printer> s; s.print_buffer_flat(buf, osize, ooffset); break; } case fft_precision_single: { buffer_printer> s; s.print_buffer_flat(buf, osize, ooffset); break; } case fft_precision_double: buffer_printer> s; s.print_buffer_flat(buf, osize, ooffset); break; } break; } case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: case fft_array_type_real: { switch(precision) { case fft_precision_half: { buffer_printer<_Float16> s; s.print_buffer_flat(buf, osize, ooffset); break; } case fft_precision_single: { buffer_printer s; s.print_buffer_flat(buf, osize, ooffset); break; } case fft_precision_double: { buffer_printer s; s.print_buffer_flat(buf, osize, ooffset); break; } } break; default: throw std::runtime_error("Invalid itype in print_ibuffer_flat"); } } } void print_obuffer_flat(const std::vector& buf) const { switch(otype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(precision) { case 
fft_precision_half: { buffer_printer> s; s.print_buffer_flat(buf, osize, ooffset); break; } case fft_precision_single: { buffer_printer> s; s.print_buffer_flat(buf, osize, ooffset); break; } case fft_precision_double: buffer_printer> s; s.print_buffer_flat(buf, osize, ooffset); break; } break; } case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: case fft_array_type_real: { switch(precision) { case fft_precision_half: { buffer_printer<_Float16> s; s.print_buffer_flat(buf, osize, ooffset); break; } case fft_precision_single: { buffer_printer s; s.print_buffer_flat(buf, osize, ooffset); break; } case fft_precision_double: { buffer_printer s; s.print_buffer_flat(buf, osize, ooffset); break; } } break; default: throw std::runtime_error("Invalid itype in print_ibuffer_flat"); } } } virtual fft_status set_callbacks(void* load_cb_host, void* load_cb_data, void* store_cb_host, void* store_cb_data) { return fft_status_success; } virtual fft_status execute(void** in, void** out) { return fft_status_success; }; size_t fft_params_vram_footprint() { return fft_params::vram_footprint(); } virtual size_t vram_footprint() { const auto ibuf_size = ibuffer_sizes(); size_t val = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1); if(placement == fft_placement_notinplace) { const auto obuf_size = obuffer_sizes(); val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1); } return val; } // Specific exception type for work buffer allocation failure. // Tests that hit this can't fit on the GPU and should be skipped. 
struct work_buffer_alloc_failure : public std::runtime_error { work_buffer_alloc_failure(const std::string& s) : std::runtime_error(s) { } }; virtual fft_status create_plan() { return fft_status_success; } // Change a forward transform to it's inverse void inverse_from_forward(fft_params& params_forward) { switch(params_forward.transform_type) { case fft_transform_type_complex_forward: transform_type = fft_transform_type_complex_inverse; break; case fft_transform_type_real_forward: transform_type = fft_transform_type_real_inverse; break; default: throw std::runtime_error("Transform type not forward."); } length = params_forward.length; istride = params_forward.ostride; ostride = params_forward.istride; nbatch = params_forward.nbatch; precision = params_forward.precision; placement = params_forward.placement; idist = params_forward.odist; odist = params_forward.idist; itype = params_forward.otype; otype = params_forward.itype; ioffset = params_forward.ooffset; ooffset = params_forward.ioffset; run_callbacks = params_forward.run_callbacks; check_output_strides = params_forward.check_output_strides; scale_factor = 1 / params_forward.scale_factor; } // prepare for multi-GPU transform. Generated input is in ibuffer. // pibuffer, pobuffer are the pointers that will be passed to the // FFT library's "execute" API. virtual void multi_gpu_prepare(std::vector& ibuffer, std::vector& pibuffer, std::vector& pobuffer) { } // finalize multi-GPU transform. pobuffers are the pointers // provided to the FFT library's "execute" API. obuffer is the // buffer where transform output needs to go for validation virtual void multi_gpu_finalize(std::vector& obuffer, std::vector& pobuffer) {} // create bricks in the specified field for the specified number // of devices. The field is split along the highest FFT // dimension, and the length only includes FFT lengths, not batch // dimension. 
void distribute_field(int deviceCount, std::vector& fields, const std::vector& field_length) { size_t slowLen = field_length.front(); if(slowLen < static_cast(deviceCount)) throw std::runtime_error("too many devices to distribute length " + std::to_string(slowLen)); auto& field = fields.emplace_back(); for(int i = 0; i < deviceCount; ++i) { // start at origin std::vector field_lower(field_length.size()); std::vector field_upper(field_length.size()); // note: slowest FFT dim is index 0 in these coordinates field_lower[0] = slowLen / deviceCount * i; // last brick needs to include the whole slow len if(i == deviceCount - 1) { field_upper[0] = slowLen; } else { field_upper[0] = std::min(slowLen, field_lower[0] + slowLen / deviceCount); } for(unsigned int upperDim = 1; upperDim < field_length.size(); ++upperDim) { field_upper[upperDim] = field_length[upperDim]; } // field coordinates also need to include batch field_lower.insert(field_lower.begin(), 0); field_upper.insert(field_upper.begin(), nbatch); // bricks have contiguous strides size_t brick_dist = 1; std::vector brick_stride(field_lower.size()); for(size_t distIdx = 0; distIdx < field_lower.size(); ++distIdx) { // fill strides from fastest to slowest *(brick_stride.rbegin() + distIdx) = brick_dist; brick_dist *= *(field_upper.rbegin() + distIdx) - *(field_lower.rbegin() + distIdx); } field.bricks.push_back( fft_params::fft_brick{field_lower, field_upper, brick_stride, i}); } } void distribute_input(int deviceCount) { distribute_field(deviceCount, ifields, length); } void distribute_output(int deviceCount) { distribute_field(deviceCount, ofields, olength()); } }; // This is used with the program_options class so that the user can type an integer on the // command line and we store into an enum varaible template std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, fft_array_type& atype) { unsigned tmp; stream >> tmp; atype = fft_array_type(tmp); return stream; } // similarly 
for transform type template std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, fft_transform_type& ttype) { unsigned tmp; stream >> tmp; ttype = fft_transform_type(tmp); return stream; } // Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths template std::vector> partition_colmajor(const T1& length) { return partition_base(length, compute_partition_count(length)); } // Partition on the rightmost part of the tuple, for col-major indexing template std::vector, std::tuple>> partition_colmajor(const std::tuple& length) { auto partitions = partition_base(std::get<1>(length), compute_partition_count(length)); std::vector, std::tuple>> ret(partitions.size()); for(size_t i = 0; i < partitions.size(); ++i) { std::get<1>(ret[i].first) = partitions[i].first; std::get<0>(ret[i].first) = 0; std::get<1>(ret[i].second) = partitions[i].second; std::get<0>(ret[i].second) = std::get<0>(length); } return ret; } template std::vector, std::tuple>> partition_colmajor(const std::tuple& length) { auto partitions = partition_base(std::get<2>(length), compute_partition_count(length)); std::vector, std::tuple>> ret(partitions.size()); for(size_t i = 0; i < partitions.size(); ++i) { std::get<2>(ret[i].first) = partitions[i].first; std::get<1>(ret[i].first) = 0; std::get<0>(ret[i].first) = 0; std::get<2>(ret[i].second) = partitions[i].second; std::get<1>(ret[i].second) = std::get<1>(length); std::get<0>(ret[i].second) = std::get<0>(length); } return ret; } // Copy data of dimensions length with strides istride and length idist between batches to // a buffer with strides ostride and length odist between batches. The input and output // types are identical. 
// Copy a strided/batched buffer element-by-element to another buffer of the
// same element type.  ioffset/ooffset give per-pointer element offsets.
template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
inline void copy_buffers_1to1(const Tval*                input,
                              Tval*                      output,
                              const Tint1&               whole_length,
                              const size_t               nbatch,
                              const Tint2&               istride,
                              const size_t               idist,
                              const Tint3&               ostride,
                              const size_t               odist,
                              const std::vector<size_t>& ioffset,
                              const std::vector<size_t>& ooffset)
{
    // if layouts match, the output index can reuse the input index
    const bool idx_equals_odx = istride == ostride && idist == odist;
    size_t     idx_base       = 0;
    size_t     odx_base       = 0;
    auto       partitions     = partition_rowmajor(whole_length);
    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
    {
#ifdef _OPENMP
#pragma omp parallel for num_threads(partitions.size())
#endif
        for(size_t part = 0; part < partitions.size(); ++part)
        {
            auto       index  = partitions[part].first;
            const auto length = partitions[part].second;
            do
            {
                const auto idx = compute_index(index, istride, idx_base);
                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
                output[odx + ooffset[0]] = input[idx + ioffset[0]];
            } while(increment_rowmajor(index, length));
        }
    }
}

// Copy data of dimensions length with strides istride and length idist between batches to
// a buffer with strides ostride and length odist between batches.  The input type is
// planar and the output type is complex interleaved.
template inline void copy_buffers_2to1(const Tval* input0, const Tval* input1, rocfft_complex* output, const Tint1& whole_length, const size_t nbatch, const Tint2& istride, const size_t idist, const Tint3& ostride, const size_t odist, const std::vector& ioffset, const std::vector& ooffset) { const bool idx_equals_odx = istride == ostride && idist == odist; size_t idx_base = 0; size_t odx_base = 0; auto partitions = partition_rowmajor(whole_length); for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) { #ifdef _OPENMP #pragma omp parallel for num_threads(partitions.size()) #endif for(size_t part = 0; part < partitions.size(); ++part) { auto index = partitions[part].first; const auto length = partitions[part].second; do { const auto idx = compute_index(index, istride, idx_base); const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); output[odx + ooffset[0]] = rocfft_complex(input0[idx + ioffset[0]], input1[idx + ioffset[1]]); } while(increment_rowmajor(index, length)); } } } // Copy data of dimensions length with strides istride and length idist between batches to // a buffer with strides ostride and length odist between batches. The input type is // complex interleaved and the output type is planar. 
template inline void copy_buffers_1to2(const rocfft_complex* input, Tval* output0, Tval* output1, const Tint1& whole_length, const size_t nbatch, const Tint2& istride, const size_t idist, const Tint3& ostride, const size_t odist, const std::vector& ioffset, const std::vector& ooffset) { const bool idx_equals_odx = istride == ostride && idist == odist; size_t idx_base = 0; size_t odx_base = 0; auto partitions = partition_rowmajor(whole_length); for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) { #ifdef _OPENMP #pragma omp parallel for num_threads(partitions.size()) #endif for(size_t part = 0; part < partitions.size(); ++part) { auto index = partitions[part].first; const auto length = partitions[part].second; do { const auto idx = compute_index(index, istride, idx_base); const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); output0[odx + ooffset[0]] = input[idx + ioffset[0]].real(); output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag(); } while(increment_rowmajor(index, length)); } } } // Copy data of dimensions length with strides istride and length idist between batches to // a buffer with strides ostride and length odist between batches. The input type given // by itype, and the output type is given by otype. 
template inline void copy_buffers(const std::vector& input, std::vector& output, const Tint1& length, const size_t nbatch, const fft_precision precision, const fft_array_type itype, const Tint2& istride, const size_t idist, const fft_array_type otype, const Tint3& ostride, const size_t odist, const std::vector& ioffset, const std::vector& ooffset) { if(itype == otype) { switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: switch(precision) { case fft_precision_half: copy_buffers_1to1( reinterpret_cast*>(input[0].data()), reinterpret_cast*>(output[0].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; case fft_precision_single: copy_buffers_1to1(reinterpret_cast*>(input[0].data()), reinterpret_cast*>(output[0].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; case fft_precision_double: copy_buffers_1to1(reinterpret_cast*>(input[0].data()), reinterpret_cast*>(output[0].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; } break; case fft_array_type_real: case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: for(unsigned int idx = 0; idx < input.size(); ++idx) { switch(precision) { case fft_precision_half: copy_buffers_1to1(reinterpret_cast(input[idx].data()), reinterpret_cast<_Float16*>(output[idx].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; case fft_precision_single: copy_buffers_1to1(reinterpret_cast(input[idx].data()), reinterpret_cast(output[idx].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; case fft_precision_double: copy_buffers_1to1(reinterpret_cast(input[idx].data()), reinterpret_cast(output[idx].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; } } break; default: throw std::runtime_error("Invalid data type"); } } else if((itype == fft_array_type_complex_interleaved && otype == 
fft_array_type_complex_planar) || (itype == fft_array_type_hermitian_interleaved && otype == fft_array_type_hermitian_planar)) { // copy 1to2 switch(precision) { case fft_precision_half: copy_buffers_1to2(reinterpret_cast*>(input[0].data()), reinterpret_cast<_Float16*>(output[0].data()), reinterpret_cast<_Float16*>(output[1].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; case fft_precision_single: copy_buffers_1to2(reinterpret_cast*>(input[0].data()), reinterpret_cast(output[0].data()), reinterpret_cast(output[1].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; case fft_precision_double: copy_buffers_1to2(reinterpret_cast*>(input[0].data()), reinterpret_cast(output[0].data()), reinterpret_cast(output[1].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; } } else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved) || (itype == fft_array_type_hermitian_planar && otype == fft_array_type_hermitian_interleaved)) { // copy 2 to 1 switch(precision) { case fft_precision_half: copy_buffers_2to1(reinterpret_cast(input[0].data()), reinterpret_cast(input[1].data()), reinterpret_cast*>(output[0].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; case fft_precision_single: copy_buffers_2to1(reinterpret_cast(input[0].data()), reinterpret_cast(input[1].data()), reinterpret_cast*>(output[0].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; case fft_precision_double: copy_buffers_2to1(reinterpret_cast(input[0].data()), reinterpret_cast(input[1].data()), reinterpret_cast*>(output[0].data()), length, nbatch, istride, idist, ostride, odist, ioffset, ooffset); break; } } else { throw std::runtime_error("Invalid input and output types."); } } // unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions template inline void copy_buffers(const std::vector& 
input, std::vector& output, const std::vector& length, const size_t nbatch, const fft_precision precision, const fft_array_type itype, const std::vector& istride, const size_t idist, const fft_array_type otype, const std::vector& ostride, const size_t odist, const std::vector& ioffset, const std::vector& ooffset) { switch(length.size()) { case 1: return copy_buffers(input, output, length[0], nbatch, precision, itype, istride[0], idist, otype, ostride[0], odist, ioffset, ooffset); case 2: return copy_buffers(input, output, std::make_tuple(length[0], length[1]), nbatch, precision, itype, std::make_tuple(istride[0], istride[1]), idist, otype, std::make_tuple(ostride[0], ostride[1]), odist, ioffset, ooffset); case 3: return copy_buffers(input, output, std::make_tuple(length[0], length[1], length[2]), nbatch, precision, itype, std::make_tuple(istride[0], istride[1], istride[2]), idist, otype, std::make_tuple(ostride[0], ostride[1], ostride[2]), odist, ioffset, ooffset); default: abort(); } } // Compute the L-infinity and L-2 distance between two buffers with strides istride and // length idist between batches to a buffer with strides ostride and length odist between // batches. Both buffers are of complex type. 
// Holds L-2 and L-infinity norms (or distances) of a buffer.
struct VectorNorms
{
    double l_2 = 0.0, l_inf = 0.0;
};

// Elementwise distance between two complex buffers; output is scaled by
// output_scalar before comparison.  Failing (batch, index) pairs whose
// running max exceeds linf_cutoff are appended to linf_failures if non-null.
template <typename Tcomplex, typename Tint1, typename Tint2, typename Tint3>
inline VectorNorms distance_1to1_complex(const Tcomplex*                         input,
                                         const Tcomplex*                         output,
                                         const Tint1&                            whole_length,
                                         const size_t                            nbatch,
                                         const Tint2&                            istride,
                                         const size_t                            idist,
                                         const Tint3&                            ostride,
                                         const size_t                            odist,
                                         std::vector<std::pair<size_t, size_t>>* linf_failures,
                                         const double                            linf_cutoff,
                                         const std::vector<size_t>&              ioffset,
                                         const std::vector<size_t>&              ooffset,
                                         const double output_scalar = 1.0)
{
    double linf = 0.0;
    double l2   = 0.0;

    std::mutex                             linf_failure_lock;
    std::vector<std::pair<size_t, size_t>> linf_failures_private;

    // if layouts match, the output index can reuse the input index
    const bool idx_equals_odx = istride == ostride && idist == odist;
    size_t     idx_base       = 0;
    size_t     odx_base       = 0;
    auto       partitions     = partition_colmajor(whole_length);
    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
    {
#ifdef _OPENMP
#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
#endif
        for(size_t part = 0; part < partitions.size(); ++part)
        {
            double     cur_linf = 0.0;
            double     cur_l2   = 0.0;
            auto       index    = partitions[part].first;
            const auto length   = partitions[part].second;

            do
            {
                const auto idx = compute_index(index, istride, idx_base);
                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
                const double rdiff
                    = std::abs(static_cast<double>(output[odx + ooffset[0]].real()) * output_scalar
                               - static_cast<double>(input[idx + ioffset[0]].real()));
                cur_linf = std::max(rdiff, cur_linf);
                if(cur_linf > linf_cutoff)
                {
                    std::pair<size_t, size_t> fval(b, idx);
                    if(linf_failures)
                        linf_failures_private.push_back(fval);
                }
                cur_l2 += rdiff * rdiff;

                const double idiff
                    = std::abs(static_cast<double>(output[odx + ooffset[0]].imag()) * output_scalar
                               - static_cast<double>(input[idx + ioffset[0]].imag()));
                cur_linf = std::max(idiff, cur_linf);
                if(cur_linf > linf_cutoff)
                {
                    std::pair<size_t, size_t> fval(b, idx);
                    if(linf_failures)
                        linf_failures_private.push_back(fval);
                }
                cur_l2 += idiff * idiff;
            } while(increment_rowmajor(index, length));
            linf = std::max(linf, cur_linf);
            l2 += cur_l2;

            // merge this thread's failure list under a lock
            if(linf_failures)
            {
                linf_failure_lock.lock();
                std::copy(linf_failures_private.begin(),
                          linf_failures_private.end(),
                          std::back_inserter(*linf_failures));
                linf_failure_lock.unlock();
            }
        }
    }
    return {.l_2 = sqrt(l2), .l_inf = linf};
}

// Compute the L-infinity and L-2 distance between two buffers with strides istride and
// length idist between batches to a buffer with strides ostride and length odist between
// batches.  Both buffers are of real type.
template inline VectorNorms distance_1to1_real(const Tfloat* input, const Tfloat* output, const Tint1& whole_length, const size_t nbatch, const Tint2& istride, const size_t idist, const Tint3& ostride, const size_t odist, std::vector>* linf_failures, const double linf_cutoff, const std::vector& ioffset, const std::vector& ooffset, const double output_scalar = 1.0) { double linf = 0.0; double l2 = 0.0; std::mutex linf_failure_lock; std::vector> linf_failures_private; const bool idx_equals_odx = istride == ostride && idist == odist; size_t idx_base = 0; size_t odx_base = 0; auto partitions = partition_rowmajor(whole_length); for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) { #ifdef _OPENMP #pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) #endif for(size_t part = 0; part < partitions.size(); ++part) { double cur_linf = 0.0; double cur_l2 = 0.0; auto index = partitions[part].first; const auto length = partitions[part].second; do { const auto idx = compute_index(index, istride, idx_base); const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); const double diff = std::abs(static_cast(output[odx + ooffset[0]]) * output_scalar - static_cast(input[idx + ioffset[0]])); cur_linf = std::max(diff, cur_linf); if(cur_linf > linf_cutoff) { std::pair fval(b, idx); if(linf_failures) linf_failures_private.push_back(fval); } cur_l2 += diff * diff; } while(increment_rowmajor(index, length)); linf = std::max(linf, cur_linf); l2 += cur_l2; if(linf_failures) { linf_failure_lock.lock(); std::copy(linf_failures_private.begin(), linf_failures_private.end(), std::back_inserter(*linf_failures)); linf_failure_lock.unlock(); } } } return {.l_2 = sqrt(l2), .l_inf = linf}; } // Compute the L-infinity and L-2 distance between two buffers with strides istride and // length idist between batches to a buffer with strides ostride and length odist between // batches. 
input is complex-interleaved, output is complex-planar. template inline VectorNorms distance_1to2(const rocfft_complex* input, const Tval* output0, const Tval* output1, const Tint1& whole_length, const size_t nbatch, const T2& istride, const size_t idist, const T3& ostride, const size_t odist, std::vector>* linf_failures, const double linf_cutoff, const std::vector& ioffset, const std::vector& ooffset, const double output_scalar = 1.0) { double linf = 0.0; double l2 = 0.0; std::mutex linf_failure_lock; std::vector> linf_failures_private; const bool idx_equals_odx = istride == ostride && idist == odist; size_t idx_base = 0; size_t odx_base = 0; auto partitions = partition_rowmajor(whole_length); for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) { #ifdef _OPENMP #pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) #endif for(size_t part = 0; part < partitions.size(); ++part) { double cur_linf = 0.0; double cur_l2 = 0.0; auto index = partitions[part].first; const auto length = partitions[part].second; do { const auto idx = compute_index(index, istride, idx_base); const auto odx = idx_equals_odx ? 
idx : compute_index(index, ostride, odx_base); const double rdiff = std::abs(static_cast(output0[odx + ooffset[0]]) * output_scalar - static_cast(input[idx + ioffset[0]].real())); cur_linf = std::max(rdiff, cur_linf); if(cur_linf > linf_cutoff) { std::pair fval(b, idx); if(linf_failures) linf_failures_private.push_back(fval); } cur_l2 += rdiff * rdiff; const double idiff = std::abs(static_cast(output1[odx + ooffset[1]]) * output_scalar - static_cast(input[idx + ioffset[0]].imag())); cur_linf = std::max(idiff, cur_linf); if(cur_linf > linf_cutoff) { std::pair fval(b, idx); if(linf_failures) linf_failures_private.push_back(fval); } cur_l2 += idiff * idiff; } while(increment_rowmajor(index, length)); linf = std::max(linf, cur_linf); l2 += cur_l2; if(linf_failures) { linf_failure_lock.lock(); std::copy(linf_failures_private.begin(), linf_failures_private.end(), std::back_inserter(*linf_failures)); linf_failure_lock.unlock(); } } } return {.l_2 = sqrt(l2), .l_inf = linf}; } // Compute the L-inifnity and L-2 distance between two buffers of dimension length and // with types given by itype, otype, and precision. 
template inline VectorNorms distance(const std::vector& input, const std::vector& output, const Tint1& length, const size_t nbatch, const fft_precision precision, const fft_array_type itype, const Tint2& istride, const size_t idist, const fft_array_type otype, const Tint3& ostride, const size_t odist, std::vector>* linf_failures, const double linf_cutoff, const std::vector& ioffset, const std::vector& ooffset, const double output_scalar = 1.0) { VectorNorms dist; if(itype == otype) { switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: switch(precision) { case fft_precision_half: dist = distance_1to1_complex( reinterpret_cast*>(input[0].data()), reinterpret_cast*>(output[0].data()), length, nbatch, istride, idist, ostride, odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; case fft_precision_single: dist = distance_1to1_complex( reinterpret_cast*>(input[0].data()), reinterpret_cast*>(output[0].data()), length, nbatch, istride, idist, ostride, odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; case fft_precision_double: dist = distance_1to1_complex( reinterpret_cast*>(input[0].data()), reinterpret_cast*>(output[0].data()), length, nbatch, istride, idist, ostride, odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; } dist.l_2 *= dist.l_2; break; case fft_array_type_real: case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: for(unsigned int idx = 0; idx < input.size(); ++idx) { VectorNorms d; switch(precision) { case fft_precision_half: d = distance_1to1_real(reinterpret_cast(input[idx].data()), reinterpret_cast(output[idx].data()), length, nbatch, istride, idist, ostride, odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; case fft_precision_single: d = distance_1to1_real(reinterpret_cast(input[idx].data()), reinterpret_cast(output[idx].data()), length, nbatch, istride, idist, ostride, odist, 
linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; case fft_precision_double: d = distance_1to1_real(reinterpret_cast(input[idx].data()), reinterpret_cast(output[idx].data()), length, nbatch, istride, idist, ostride, odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; } dist.l_inf = std::max(d.l_inf, dist.l_inf); dist.l_2 += d.l_2 * d.l_2; } break; default: throw std::runtime_error("Invalid input and output types."); } } else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar) || (itype == fft_array_type_hermitian_interleaved && otype == fft_array_type_hermitian_planar)) { switch(precision) { case fft_precision_half: dist = distance_1to2(reinterpret_cast*>(input[0].data()), reinterpret_cast(output[0].data()), reinterpret_cast(output[1].data()), length, nbatch, istride, idist, ostride, odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; case fft_precision_single: dist = distance_1to2(reinterpret_cast*>(input[0].data()), reinterpret_cast(output[0].data()), reinterpret_cast(output[1].data()), length, nbatch, istride, idist, ostride, odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; case fft_precision_double: dist = distance_1to2(reinterpret_cast*>(input[0].data()), reinterpret_cast(output[0].data()), reinterpret_cast(output[1].data()), length, nbatch, istride, idist, ostride, odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; } dist.l_2 *= dist.l_2; } else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved) || (itype == fft_array_type_hermitian_planar && otype == fft_array_type_hermitian_interleaved)) { switch(precision) { case fft_precision_half: dist = distance_1to2(reinterpret_cast*>(output[0].data()), reinterpret_cast(input[0].data()), reinterpret_cast(input[1].data()), length, nbatch, ostride, odist, istride, idist, linf_failures, linf_cutoff, ioffset, ooffset, 
output_scalar); break; case fft_precision_single: dist = distance_1to2(reinterpret_cast*>(output[0].data()), reinterpret_cast(input[0].data()), reinterpret_cast(input[1].data()), length, nbatch, ostride, odist, istride, idist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; case fft_precision_double: dist = distance_1to2(reinterpret_cast*>(output[0].data()), reinterpret_cast(input[0].data()), reinterpret_cast(input[1].data()), length, nbatch, ostride, odist, istride, idist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); break; } dist.l_2 *= dist.l_2; } else { throw std::runtime_error("Invalid input and output types."); } dist.l_2 = sqrt(dist.l_2); return dist; } // check if the specified length + stride/dist is contiguous template bool is_contiguous_rowmajor(const std::vector& length, const std::vector& stride, size_t dist) { size_t expected_stride = 1; auto stride_it = stride.rbegin(); auto length_it = length.rbegin(); for(; stride_it != stride.rend() && length_it != length.rend(); ++stride_it, ++length_it) { if(*stride_it != expected_stride) return false; expected_stride *= *length_it; } return expected_stride == dist; } // Unroll arbitrary-dimension distance into specializations for 1-, 2-, 3-dimensions template inline VectorNorms distance(const std::vector& input, const std::vector& output, std::vector length, size_t nbatch, const fft_precision precision, const fft_array_type itype, std::vector istride, const size_t idist, const fft_array_type otype, std::vector ostride, const size_t odist, std::vector>* linf_failures, const double linf_cutoff, const std::vector& ioffset, const std::vector& ooffset, const double output_scalar = 1.0) { // If istride and ostride are both contiguous, collapse them down // to one dimension. Index calculation is simpler (and faster) // in the 1D case. 
if(is_contiguous_rowmajor(length, istride, idist) && is_contiguous_rowmajor(length, ostride, odist)) { length = {product(length.begin(), length.end()) * nbatch}; istride = {static_cast(1)}; ostride = {static_cast(1)}; nbatch = 1; } switch(length.size()) { case 1: return distance(input, output, length[0], nbatch, precision, itype, istride[0], idist, otype, ostride[0], odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); case 2: return distance(input, output, std::make_tuple(length[0], length[1]), nbatch, precision, itype, std::make_tuple(istride[0], istride[1]), idist, otype, std::make_tuple(ostride[0], ostride[1]), odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); case 3: return distance(input, output, std::make_tuple(length[0], length[1], length[2]), nbatch, precision, itype, std::make_tuple(istride[0], istride[1], istride[2]), idist, otype, std::make_tuple(ostride[0], ostride[1], ostride[2]), odist, linf_failures, linf_cutoff, ioffset, ooffset, output_scalar); default: abort(); } } // Compute the L-infinity and L-2 norm of a buffer with strides istride and // length idist. Data is rocfft_complex. 
template inline VectorNorms norm_complex(const Tcomplex* input, const T1& whole_length, const size_t nbatch, const T2& istride, const size_t idist, const std::vector& offset) { double linf = 0.0; double l2 = 0.0; size_t idx_base = 0; auto partitions = partition_rowmajor(whole_length); for(size_t b = 0; b < nbatch; b++, idx_base += idist) { #ifdef _OPENMP #pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) #endif for(size_t part = 0; part < partitions.size(); ++part) { double cur_linf = 0.0; double cur_l2 = 0.0; auto index = partitions[part].first; const auto length = partitions[part].second; do { const auto idx = compute_index(index, istride, idx_base); const double rval = std::abs(static_cast(input[idx + offset[0]].real())); cur_linf = std::max(rval, cur_linf); cur_l2 += rval * rval; const double ival = std::abs(static_cast(input[idx + offset[0]].imag())); cur_linf = std::max(ival, cur_linf); cur_l2 += ival * ival; } while(increment_rowmajor(index, length)); linf = std::max(linf, cur_linf); l2 += cur_l2; } } return {.l_2 = sqrt(l2), .l_inf = linf}; } // Compute the L-infinity and L-2 norm of abuffer with strides istride and // length idist. Data is real-valued. 
template inline VectorNorms norm_real(const Tfloat* input, const T1& whole_length, const size_t nbatch, const T2& istride, const size_t idist, const std::vector& offset) { double linf = 0.0; double l2 = 0.0; size_t idx_base = 0; auto partitions = partition_rowmajor(whole_length); for(size_t b = 0; b < nbatch; b++, idx_base += idist) { #ifdef _OPENMP #pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) #endif for(size_t part = 0; part < partitions.size(); ++part) { double cur_linf = 0.0; double cur_l2 = 0.0; auto index = partitions[part].first; const auto length = partitions[part].second; do { const auto idx = compute_index(index, istride, idx_base); const double val = std::abs(static_cast(input[idx + offset[0]])); cur_linf = std::max(val, cur_linf); cur_l2 += val * val; } while(increment_rowmajor(index, length)); linf = std::max(linf, cur_linf); l2 += cur_l2; } } return {.l_2 = sqrt(l2), .l_inf = linf}; } // Compute the L-infinity and L-2 norm of abuffer with strides istride and // length idist. Data format is given by precision and itype. 
template inline VectorNorms norm(const std::vector& input, const T1& length, const size_t nbatch, const fft_precision precision, const fft_array_type itype, const T2& istride, const size_t idist, const std::vector& offset) { VectorNorms norm; switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: switch(precision) { case fft_precision_half: norm = norm_complex(reinterpret_cast*>(input[0].data()), length, nbatch, istride, idist, offset); break; case fft_precision_single: norm = norm_complex(reinterpret_cast*>(input[0].data()), length, nbatch, istride, idist, offset); break; case fft_precision_double: norm = norm_complex(reinterpret_cast*>(input[0].data()), length, nbatch, istride, idist, offset); break; } norm.l_2 *= norm.l_2; break; case fft_array_type_real: case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: for(unsigned int idx = 0; idx < input.size(); ++idx) { VectorNorms n; switch(precision) { case fft_precision_half: n = norm_real(reinterpret_cast(input[idx].data()), length, nbatch, istride, idist, offset); break; case fft_precision_single: n = norm_real(reinterpret_cast(input[idx].data()), length, nbatch, istride, idist, offset); break; case fft_precision_double: n = norm_real(reinterpret_cast(input[idx].data()), length, nbatch, istride, idist, offset); break; } norm.l_inf = std::max(n.l_inf, norm.l_inf); norm.l_2 += n.l_2 * n.l_2; } break; default: throw std::runtime_error("Invalid data type"); } norm.l_2 = sqrt(norm.l_2); return norm; } // Unroll arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions template inline VectorNorms norm(const std::vector& input, std::vector length, size_t nbatch, const fft_precision precision, const fft_array_type type, std::vector stride, const size_t dist, const std::vector& offset) { // If stride is contiguous, collapse it down to one dimension. // Index calculation is simpler (and faster) in the 1D case. 
if(is_contiguous_rowmajor(length, stride, dist)) { length = {product(length.begin(), length.end()) * nbatch}; stride = {static_cast(1)}; nbatch = 1; } switch(length.size()) { case 1: return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset); case 2: return norm(input, std::make_tuple(length[0], length[1]), nbatch, precision, type, std::make_tuple(stride[0], stride[1]), dist, offset); case 3: return norm(input, std::make_tuple(length[0], length[1], length[2]), nbatch, precision, type, std::make_tuple(stride[0], stride[1], stride[2]), dist, offset); default: abort(); } } // Given a data type and precision, the distance between batches, and // the batch size, allocate the required host buffer(s). static std::vector allocate_host_buffer(const fft_precision precision, const fft_array_type type, const std::vector& size) { std::vector buffers(size.size()); for(unsigned int i = 0; i < size.size(); ++i) { buffers[i].alloc(size[i] * var_size(precision, type)); } return buffers; } // Check if the required buffers fit in the device vram. inline bool vram_fits_problem(const size_t prob_size, const size_t vram_avail, int deviceId = 0) { // We keep a small margin of error for fitting the problem into vram: const size_t extra = 1 << 27; return vram_avail > prob_size + extra; } // Computes the twiddle table VRAM footprint for r2c/c2r transforms. // This function will return 0 for the other transform types, since // the VRAM footprint in rocFFT is negligible for the other cases. inline size_t twiddle_table_vram_footprint(const fft_params& params) { size_t vram_footprint = 0; // Add vram footprint from real/complex even twiddle buffer size. if(params.transform_type == fft_transform_type_real_forward || params.transform_type == fft_transform_type_real_inverse) { const auto realdim = params.length.back(); if(realdim % 2 == 0) { const auto complex_size = params.precision == fft_precision_single ? 
8 : 16; // even length twiddle size is 1/4 of the real size, but // in complex elements vram_footprint += realdim * complex_size / 4; } } return vram_footprint; } #endif upstream/shared/environment.h0000664000175000017500000000570714637252753015370 0ustar kaolkaol// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // wrappers around environment variable routines #pragma once #include // Windows provides "getenv" and "_putenv", but those modify the // runtime's copy of the environment. The actual environment in the // process control block is accessed using GetEnvironmentVariable and // SetEnvironmentVariable. 
#ifdef WIN32 #include static void rocfft_setenv(const char* var, const char* value) { SetEnvironmentVariable(var, value); } static void rocfft_unsetenv(const char* var) { SetEnvironmentVariable(var, nullptr); } static std::string rocfft_getenv(const char* var) { DWORD size = GetEnvironmentVariable(var, nullptr, 0); std::string ret; if(size) { ret.resize(size); GetEnvironmentVariable(var, ret.data(), size); // GetEnvironmentVariable counts the terminating null, so remove it while(!ret.empty() && ret.back() == 0) ret.pop_back(); } return ret; } #else #include static void rocfft_setenv(const char* var, const char* value) { setenv(var, value, 1); } static void rocfft_unsetenv(const char* var) { unsetenv(var); } static std::string rocfft_getenv(const char* var) { auto value = getenv(var); return value ? value : ""; } #endif // RAII object to set an environment variable and restore it to its // previous value on destruction struct EnvironmentSetTemp { EnvironmentSetTemp(const char* _var, const char* val) : var(_var) { auto val_ptr = rocfft_getenv(_var); if(!val_ptr.empty()) oldvalue = val_ptr; rocfft_setenv(_var, val); } ~EnvironmentSetTemp() { if(oldvalue.empty()) rocfft_unsetenv(var.c_str()); else rocfft_setenv(var.c_str(), oldvalue.c_str()); } std::string var; std::string oldvalue; }; upstream/shared/rocfft_hip.h0000664000175000017500000000363014637252753015140 0ustar kaolkaol// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef __ROCFFT_HIP_H__ #define __ROCFFT_HIP_H__ #include #include class rocfft_scoped_device { public: rocfft_scoped_device(int device) { if(hipGetDevice(&orig_device) != hipSuccess) throw std::runtime_error("hipGetDevice failure"); if(hipSetDevice(device) != hipSuccess) throw std::runtime_error("hipSetDevice failure"); } ~rocfft_scoped_device() { (void)hipSetDevice(orig_device); } // not copyable or movable rocfft_scoped_device(const rocfft_scoped_device&) = delete; rocfft_scoped_device(rocfft_scoped_device&&) = delete; rocfft_scoped_device& operator=(const rocfft_scoped_device&) = delete; private: int orig_device; }; #endif // __ROCFFT_HIP_H__ upstream/shared/accuracy_test.h0000664000175000017500000022336714637253000015643 0ustar kaolkaol// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef ACCURACY_TEST #define ACCURACY_TEST #include #include #include #include #include #include #include "enum_to_string.h" #include "fft_params.h" #include "fftw_transform.h" #include "gpubuf.h" #include "rocfft_against_fftw.h" #include "test_params.h" extern int verbose; extern size_t ramgb; extern bool fftw_compare; static const size_t ONE_GiB = 1 << 30; inline size_t bytes_to_GiB(const size_t bytes) { return bytes == 0 ? 0 : (bytes - 1 + ONE_GiB) / ONE_GiB; } typedef std::tuple type_place_io_t; // Remember the results of the last FFT we computed with FFTW. Tests // are ordered so that later cases can often reuse this result. 
struct last_cpu_fft_cache { // keys to the cache std::vector length; size_t nbatch = 0; fft_transform_type transform_type = fft_transform_type_complex_forward; bool run_callbacks = false; fft_precision precision = fft_precision_single; // FFTW input/output std::vector cpu_input; std::vector cpu_output; }; extern last_cpu_fft_cache last_cpu_fft_data; struct system_memory { size_t total_bytes = 0; size_t free_bytes = 0; }; extern system_memory start_memory; system_memory get_system_memory(); // Estimate the amount of host memory needed for buffers. inline size_t needed_ram_buffers(const fft_params& params, const int verbose) { // This calculation is assuming contiguous data but noncontiguous buffers // are assumed to require a close enough amount of space for the purposes // of this estimate. size_t needed_ram = 6 * std::accumulate(params.length.begin(), params.length.end(), static_cast(1), std::multiplies()); // Account for precision and data type: if(params.transform_type != fft_transform_type_real_forward && params.transform_type != fft_transform_type_real_inverse) { needed_ram *= 2; } switch(params.precision) { case fft_precision_half: needed_ram *= 2; break; case fft_precision_single: needed_ram *= 4; break; case fft_precision_double: needed_ram *= 8; break; } needed_ram *= params.nbatch; if(verbose) { std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n"; } return needed_ram; } template bool fftw_plan_uses_bluestein(const typename fftw_trait::fftw_plan_type& cpu_plan) { #ifdef FFTW_HAVE_SPRINT_PLAN char* print_plan_c_str = fftw_sprint_plan(cpu_plan); std::string print_plan(print_plan_c_str); free(print_plan_c_str); return print_plan.find("bluestein") != std::string::npos; #else // assume worst case (bluestein is always used) return true; #endif } // Estimate the amount of host memory needed for fftw. 
template inline size_t needed_ram_fftw(const fft_params& contiguous_params, const typename fftw_trait::fftw_plan_type& cpu_plan, const int verbose) { size_t total_length = std::accumulate(contiguous_params.length.begin(), contiguous_params.length.end(), static_cast(1), std::multiplies()); size_t needed_ram = 0; // Detect Bluestein in plan if(fftw_plan_uses_bluestein(cpu_plan)) { for(size_t dim : contiguous_params.length) { unsigned int needed_ram_dim = dim; // Next-plus-one-power-of-two multiplied any other lengths needed_ram_dim--; needed_ram_dim |= needed_ram_dim >> 2; needed_ram_dim |= needed_ram_dim >> 4; needed_ram_dim |= needed_ram_dim >> 8; needed_ram_dim |= needed_ram_dim >> 16; needed_ram_dim++; needed_ram_dim *= 2 * (total_length / dim); if(needed_ram_dim > needed_ram) { needed_ram = needed_ram_dim; } } } // Account for precision and data type: if(contiguous_params.transform_type != fft_transform_type_real_forward && contiguous_params.transform_type != fft_transform_type_real_inverse) { needed_ram *= 2; } switch(contiguous_params.precision) { case fft_precision_half: needed_ram *= 2; break; case fft_precision_single: needed_ram *= 4; break; case fft_precision_double: needed_ram *= 8; break; } needed_ram *= contiguous_params.nbatch; if(verbose) { std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n"; } return needed_ram; } // Base gtest class for comparison with FFTW. 
class accuracy_test : public ::testing::TestWithParam { protected: void SetUp() override {} void TearDown() override {} public: static std::string TestName(const testing::TestParamInfo& info) { return info.param.token(); } }; const static std::vector batch_range = {2, 1}; const static std::vector precision_range_full = {fft_precision_double, fft_precision_single, fft_precision_half}; const static std::vector precision_range_sp_dp = {fft_precision_double, fft_precision_single}; const static std::vector place_range = {fft_placement_inplace, fft_placement_notinplace}; const static std::vector trans_type_range = {fft_transform_type_complex_forward, fft_transform_type_real_forward}; const static std::vector trans_type_range_complex = {fft_transform_type_complex_forward}; const static std::vector trans_type_range_real = {fft_transform_type_real_forward}; // Given a vector of vector of lengths, generate all unique permutations. // Add an optional vector of ad-hoc lengths to the result. inline std::vector> generate_lengths(const std::vector>& inlengths) { std::vector> output; if(inlengths.size() == 0) { return output; } const size_t dim = inlengths.size(); std::vector looplength(dim); for(unsigned int i = 0; i < dim; ++i) { looplength[i] = inlengths[i].size(); } for(unsigned int idx = 0; idx < inlengths.size(); ++idx) { std::vector index(dim); do { std::vector length(dim); for(unsigned int i = 0; i < dim; ++i) { length[i] = inlengths[i][index[i]]; } output.push_back(length); } while(increment_rowmajor(index, looplength)); } // uniquify the result std::sort(output.begin(), output.end()); output.erase(std::unique(output.begin(), output.end()), output.end()); return output; } // Return the valid rocFFT input and output types for a given transform type. 
inline std::vector> iotypes(const fft_transform_type transformType, const fft_result_placement place, const bool planar = true) { std::vector> iotypes; switch(transformType) { case fft_transform_type_complex_forward: case fft_transform_type_complex_inverse: iotypes.push_back(std::make_pair( fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)); if(planar) { iotypes.push_back(std::make_pair( fft_array_type_complex_planar, fft_array_type_complex_planar)); if(place == fft_placement_notinplace) { iotypes.push_back(std::make_pair( fft_array_type_complex_planar, fft_array_type_complex_interleaved)); iotypes.push_back(std::make_pair( fft_array_type_complex_interleaved, fft_array_type_complex_planar)); } } break; case fft_transform_type_real_forward: iotypes.push_back(std::make_pair( fft_array_type_real, fft_array_type_hermitian_interleaved)); if(planar && place == fft_placement_notinplace) { iotypes.push_back(std::make_pair( fft_array_type_real, fft_array_type_hermitian_planar)); } break; case fft_transform_type_real_inverse: iotypes.push_back(std::make_pair( fft_array_type_hermitian_interleaved, fft_array_type_real)); if(planar && place == fft_placement_notinplace) { iotypes.push_back(std::make_pair( fft_array_type_hermitian_planar, fft_array_type_real)); } break; default: throw std::runtime_error("Invalid transform type"); } return iotypes; } // Generate all combinations of input/output types, from combinations of transform and placement // types. 
static std::vector generate_types(fft_transform_type transform_type, const std::vector& place_range, const bool planar) { std::vector ret; for(auto place : place_range) { for(auto iotype : iotypes(transform_type, place, planar)) { ret.push_back(std::make_tuple(transform_type, place, iotype.first, iotype.second)); } } return ret; } struct stride_generator { struct stride_dist { stride_dist(const std::vector& s, size_t d) : stride(s) , dist(d) { } std::vector stride; size_t dist; }; // NOTE: allow for this ctor to be implicit, so it's less typing for a test writer // // cppcheck-suppress noExplicitConstructor stride_generator(const std::vector>& stride_list_in) : stride_list(stride_list_in) { } virtual std::vector generate(const std::vector& lengths, size_t batch) const { std::vector ret; for(const auto& s : stride_list) ret.emplace_back(s, 0); return ret; } std::vector> stride_list; }; // Generate strides such that batch is essentially the innermost dimension // e.g. given a batch-2 4x3x2 transform which logically looks like: // // batch0: // A B A B // A B A B // A B A B // // A B A B // A B A B // A B A B // // batch1: // A B A B // A B A B // A B A B // // A B A B // A B A B // A B A B // // we instead do stride-2 4x3x2 transform where first batch is the // A's and second batch is the B's. struct stride_generator_3D_inner_batch : public stride_generator { explicit stride_generator_3D_inner_batch(const std::vector>& stride_list_in) : stride_generator(stride_list_in) { } std::vector generate(const std::vector& lengths, size_t batch) const override { std::vector ret = stride_generator::generate(lengths, batch); std::vector strides{lengths[1] * lengths[2] * batch, lengths[2] * batch, batch}; ret.emplace_back(strides, 1); return ret; } }; // Create an array of parameters to pass to gtest. Base generator // that allows choosing transform type. 
inline auto param_generator_base(const std::vector& type_range, const std::vector>& v_lengths, const std::vector& precision_range, const std::vector& batch_range, decltype(generate_types) types_generator, const stride_generator& istride, const stride_generator& ostride, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, const bool planar = true, const bool run_callbacks = false) { std::vector params; // For any length, we compute double-precision CPU reference // for largest batch size first and reuse for smaller batch // sizes, then convert to single-precision. for(auto& transform_type : type_range) { for(const auto& lengths : v_lengths) { // try to ensure that we are given literal lengths, not // something to be passed to generate_lengths if(lengths.empty() || lengths.size() > 3) { continue; } { for(const auto precision : precision_range) { for(const auto batch : batch_range) { for(const auto& types : types_generator(transform_type, place_range, planar)) { for(const auto& istride_dist : istride.generate(lengths, batch)) { for(const auto& ostride_dist : ostride.generate(lengths, batch)) { for(const auto& ioffset : ioffset_range) { for(const auto& ooffset : ooffset_range) { fft_params param; param.length = lengths; param.istride = istride_dist.stride; param.ostride = ostride_dist.stride; param.nbatch = batch; param.precision = precision; param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.idist = istride_dist.dist; param.odist = ostride_dist.dist; param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.ioffset = ioffset; param.ooffset = ooffset; if(run_callbacks) { // add a test if both input and output support callbacks if(param.itype != fft_array_type_complex_planar && param.itype != fft_array_type_hermitian_planar && param.otype != fft_array_type_complex_planar && param.otype != fft_array_type_hermitian_planar) { param.run_callbacks = true; } else { continue; 
} } param.validate(); // Keeping the random number generator here // allows one to run the same tests for a given // random seed; ie the test suite is repeatable. std::hash hasher; std::ranlux24_base gen(random_seed + hasher(param.token())); std::uniform_real_distribution<> dis(0.0, 1.0); if(param.is_planar()) { const double roll = dis(gen); if(roll > planar_prob) { if(verbose > 4) { std::cout << "Planar transform skipped " "(planar_prob: " << planar_prob << " > " << roll << ")\n"; } continue; } } if(run_callbacks) { const double roll = dis(gen); if(roll > callback_prob) { if(verbose > 4) { std::cout << "Callback transform skipped " "(planar_prob: " << planar_prob << " > " << roll << ")\n"; } continue; } } if(param.valid(0)) { params.push_back(param); } } } } } } } } } } } return params; } // Create an array of parameters to pass to gtest. Default generator // that picks all transform types. inline auto param_generator(const std::vector>& v_lengths, const std::vector& precision_range, const std::vector& batch_range, const stride_generator& istride, const stride_generator& ostride, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, const bool planar, const bool run_callbacks = false) { return param_generator_base(trans_type_range, v_lengths, precision_range, batch_range, generate_types, istride, ostride, ioffset_range, ooffset_range, place_range, planar, run_callbacks); } // Create an array of parameters to pass to gtest. 
Only tests complex-type transforms inline auto param_generator_complex(const std::vector>& v_lengths, const std::vector& precision_range, const std::vector& batch_range, const stride_generator& istride, const stride_generator& ostride, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, const bool planar, const bool run_callbacks = false) { return param_generator_base(trans_type_range_complex, v_lengths, precision_range, batch_range, generate_types, istride, ostride, ioffset_range, ooffset_range, place_range, planar, run_callbacks); } // Create an array of parameters to pass to gtest. inline auto param_generator_real(const std::vector>& v_lengths, const std::vector& precision_range, const std::vector& batch_range, const stride_generator& istride, const stride_generator& ostride, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, const bool planar, const bool run_callbacks = false) { return param_generator_base(trans_type_range_real, v_lengths, precision_range, batch_range, generate_types, istride, ostride, ioffset_range, ooffset_range, place_range, planar, run_callbacks); } template auto param_generator_token(const Tcontainer& tokens) { std::vector params; params.reserve(tokens.size()); for(auto t : tokens) { params.push_back({}); params.back().from_token(t); } return params; } struct callback_test_data { // scalar to modify the input/output with double scalar; // base address of input, to ensure that each callback gets an offset from that base void* base; }; void* get_load_callback_host(fft_array_type itype, fft_precision precision, bool round_trip_inverse); void apply_load_callback(const fft_params& params, std::vector& input); void apply_store_callback(const fft_params& params, std::vector& output); void* get_store_callback_host(fft_array_type otype, fft_precision precision, bool round_trip_inverse); static auto allocate_cpu_fft_buffer(const fft_precision 
precision, const fft_array_type type, const std::vector& size) { // FFTW does not support half-precision, so we do single instead. // So if we need to do a half-precision FFTW transform, allocate // enough buffer for single-precision instead. return allocate_host_buffer( precision == fft_precision_half ? fft_precision_single : precision, type, size); } template inline void execute_cpu_fft(fft_params& params, fft_params& contiguous_params, typename fftw_trait::fftw_plan_type& cpu_plan, std::vector& cpu_input, std::vector& cpu_output) { // CPU output might not be allocated already for us, if FFTW never // needed an output buffer during planning if(cpu_output.empty()) cpu_output = allocate_cpu_fft_buffer( contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); // If this is either C2R or callbacks are enabled, the // input will be modified. So we need to modify the copy instead. std::vector cpu_input_copy(cpu_input.size()); std::vector* input_ptr = &cpu_input; if(params.run_callbacks || contiguous_params.transform_type == fft_transform_type_real_inverse) { for(size_t i = 0; i < cpu_input.size(); ++i) { cpu_input_copy[i] = cpu_input[i].copy(); } input_ptr = &cpu_input_copy; } // run FFTW (which may destroy CPU input) apply_load_callback(params, *input_ptr); fftw_run(contiguous_params.transform_type, cpu_plan, *input_ptr, cpu_output); // clean up fftw_destroy_plan_type(cpu_plan); // ask FFTW to fully clean up, since it tries to cache plan details fftw_cleanup(); cpu_plan = nullptr; apply_store_callback(params, cpu_output); } // execute the GPU transform template inline void execute_gpu_fft(Tparams& params, std::vector& pibuffer, std::vector& pobuffer, std::vector& obuffer, std::vector& gpu_output, bool round_trip_inverse = false) { gpubuf_t load_cb_data_dev; gpubuf_t store_cb_data_dev; if(params.run_callbacks) { void* load_cb_host = get_load_callback_host(params.itype, params.precision, round_trip_inverse); callback_test_data load_cb_data_host; 
if(round_trip_inverse) { load_cb_data_host.scalar = params.store_cb_scalar; } else { load_cb_data_host.scalar = params.load_cb_scalar; } load_cb_data_host.base = pibuffer.front(); auto hip_status = hipSuccess; hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data)); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP(); } else { GTEST_FAIL(); } } hip_status = hipMemcpy(load_cb_data_dev.data(), &load_cb_data_host, sizeof(callback_test_data), hipMemcpyHostToDevice); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP(); } else { GTEST_FAIL(); } } void* store_cb_host = get_store_callback_host(params.otype, params.precision, round_trip_inverse); callback_test_data store_cb_data_host; if(round_trip_inverse) { store_cb_data_host.scalar = params.load_cb_scalar; } else { store_cb_data_host.scalar = params.store_cb_scalar; } store_cb_data_host.base = pobuffer.front(); hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data)); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP(); } else { GTEST_FAIL(); } } hip_status = hipMemcpy(store_cb_data_dev.data(), &store_cb_data_host, sizeof(callback_test_data), hipMemcpyHostToDevice); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP(); } else { GTEST_FAIL(); } } auto fft_status = params.set_callbacks( load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data()); if(fft_status != fft_status_success) throw std::runtime_error("set callback failure"); } // Execute the transform: auto fft_status = params.execute(pibuffer.data(), pobuffer.data()); if(fft_status != fft_status_success) throw std::runtime_error("rocFFT plan execution failure"); // if not comparing, then just executing the GPU FFT is all we // need to do if(!fftw_compare) return; // finalize a multi-GPU transform params.multi_gpu_finalize(obuffer, pobuffer); ASSERT_TRUE(!gpu_output.empty()) << "no output 
buffers"; for(unsigned int idx = 0; idx < gpu_output.size(); ++idx) { ASSERT_TRUE(gpu_output[idx].data() != nullptr) << "output buffer index " << idx << " is empty"; auto hip_status = hipMemcpy(gpu_output[idx].data(), pobuffer.at(idx), gpu_output[idx].size(), hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemcpy failure"; } else { GTEST_FAIL() << "hipMemcpy failure"; } } } if(verbose > 2) { std::cout << "GPU output:\n"; params.print_obuffer(gpu_output); } if(verbose > 5) { std::cout << "flat GPU output:\n"; params.print_obuffer_flat(gpu_output); } } template static void assert_init_value(const std::vector& output, const size_t idx, const Tfloat orig_value); template <> void assert_init_value(const std::vector& output, const size_t idx, const float orig_value) { float actual_value = reinterpret_cast(output.front().data())[idx]; ASSERT_EQ(actual_value, orig_value) << "index " << idx; } template <> void assert_init_value(const std::vector& output, const size_t idx, const double orig_value) { double actual_value = reinterpret_cast(output.front().data())[idx]; ASSERT_EQ(actual_value, orig_value) << "index " << idx; } template <> void assert_init_value(const std::vector& output, const size_t idx, const rocfft_complex orig_value) { // if this is interleaved, check directly if(output.size() == 1) { rocfft_complex actual_value = reinterpret_cast*>(output.front().data())[idx]; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } else { // planar rocfft_complex actual_value{ reinterpret_cast(output.front().data())[idx], reinterpret_cast(output.back().data())[idx]}; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } } template <> void assert_init_value(const std::vector& output, const size_t idx, const rocfft_complex orig_value) { // if this is interleaved, 
check directly if(output.size() == 1) { rocfft_complex actual_value = reinterpret_cast*>(output.front().data())[idx]; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } else { // planar rocfft_complex actual_value{ reinterpret_cast(output.front().data())[idx], reinterpret_cast(output.back().data())[idx]}; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } } static const int OUTPUT_INIT_PATTERN = 0xcd; template void check_single_output_stride(const std::vector& output, const size_t offset, const std::vector& length, const std::vector& stride, const size_t i) { Tfloat orig; memset(static_cast(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat)); size_t curLength = length[i]; size_t curStride = stride[i]; size_t nextSmallerLength = i == length.size() - 1 ? 0 : length[i + 1]; size_t nextSmallerStride = i == stride.size() - 1 ? 0 : stride[i + 1]; if(nextSmallerLength == 0) { // this is the fastest dim, indexes that are not multiples of // the stride should be the initial value for(size_t idx = 0; idx < (curLength - 1) * curStride; ++idx) { if(idx % curStride != 0) assert_init_value(output, idx, orig); } } else { for(size_t lengthIdx = 0; lengthIdx < curLength; ++lengthIdx) { // check that the space after the next smaller dim and the // end of this dim is initial value for(size_t idx = nextSmallerLength * nextSmallerStride; idx < curStride; ++idx) assert_init_value(output, idx, orig); check_single_output_stride( output, offset + lengthIdx * curStride, length, stride, i + 1); } } } template void check_output_strides(const std::vector& output, Tparams& params) { // treat batch+dist like highest length+stride, if batch > 1 std::vector length; std::vector stride; if(params.nbatch > 1) { length.push_back(params.nbatch); stride.push_back(params.odist); } auto olength = params.olength(); std::copy(olength.begin(), olength.end(), 
std::back_inserter(length)); std::copy(params.ostride.begin(), params.ostride.end(), std::back_inserter(stride)); if(params.precision == fft_precision_single) { if(params.otype == fft_array_type_real) check_single_output_stride(output, 0, length, stride, 0); else check_single_output_stride>(output, 0, length, stride, 0); } else { if(params.otype == fft_array_type_real) check_single_output_stride(output, 0, length, stride, 0); else check_single_output_stride>(output, 0, length, stride, 0); } } // run rocFFT inverse transform template inline void run_round_trip_inverse(Tparams& params, std::vector& obuffer, std::vector& pibuffer, std::vector& pobuffer, std::vector& gpu_output) { params.validate(); // Make sure that the parameters make sense: ASSERT_TRUE(params.valid(verbose)); // Create FFT plan - this will also allocate work buffer, but will throw a // specific exception if that step fails auto plan_status = fft_status_success; try { plan_status = params.create_plan(); } catch(fft_params::work_buffer_alloc_failure& e) { std::stringstream ss; ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")"; ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed"; auto obuffer_sizes = params.obuffer_sizes(); if(params.placement != fft_placement_inplace) { for(unsigned int i = 0; i < obuffer_sizes.size(); ++i) { // If we're validating output strides, init the // output buffer to a known pattern and we can check // that the pattern is untouched in places that // shouldn't have been touched. 
if(params.check_output_strides) { auto hip_status = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemset failure"; } else { GTEST_FAIL() << "hipMemset failure"; } } } } } // execute GPU transform execute_gpu_fft(params, pibuffer, pobuffer, obuffer, gpu_output, true); } // compare rocFFT inverse transform with forward transform input template inline void compare_round_trip_inverse(Tparams& params, fft_params& contiguous_params, std::vector& gpu_output, std::vector& cpu_input, const VectorNorms& cpu_input_norm, size_t total_length) { if(params.check_output_strides) { check_output_strides(gpu_output, params); } // compute GPU output norm std::shared_future gpu_norm = std::async(std::launch::async, [&]() { return norm(gpu_output, params.olength(), params.nbatch, params.precision, params.otype, params.ostride, params.odist, params.ooffset); }); // compare GPU inverse output to CPU forward input std::unique_ptr>> linf_failures; if(verbose > 1) linf_failures = std::make_unique>>(); const double linf_cutoff = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length); VectorNorms diff = distance(cpu_input, gpu_output, params.olength(), params.nbatch, params.precision, contiguous_params.itype, contiguous_params.istride, contiguous_params.idist, params.otype, params.ostride, params.odist, linf_failures.get(), linf_cutoff, {0}, params.ooffset, 1.0 / total_length); if(verbose > 1) { std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; std::cout << "GPU linf norm failures:"; std::sort(linf_failures->begin(), linf_failures->end()); for(const auto& i : *linf_failures) { std::cout << " (" << i.first << "," << i.second << ")"; } std::cout << std::endl; } EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << 
params.str(); switch(params.precision) { case fft_precision_half: max_linf_eps_half = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); max_l2_eps_half = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); break; case fft_precision_single: max_linf_eps_single = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); max_l2_eps_single = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); break; case fft_precision_double: max_linf_eps_double = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); max_l2_eps_double = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); break; } if(verbose > 1) { std::cout << "L2 diff: " << diff.l_2 << "\n"; std::cout << "Linf diff: " << diff.l_inf << "\n"; } EXPECT_TRUE(diff.l_inf <= linf_cutoff) << "Linf test failed. Linf:" << diff.l_inf << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff << params.str(); EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2 < sqrt(log2(total_length)) * type_epsilon(params.precision)) << "L2 test failed. 
L2: " << diff.l_2 << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2 << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) << params.str(); } // RAII type to put data into the cache when this object leaves scope struct StoreCPUDataToCache { StoreCPUDataToCache(std::vector& cpu_input, std::vector& cpu_output) : cpu_input(cpu_input) , cpu_output(cpu_output) { } ~StoreCPUDataToCache() { last_cpu_fft_data.cpu_output.swap(cpu_output); last_cpu_fft_data.cpu_input.swap(cpu_input); } std::vector& cpu_input; std::vector& cpu_output; }; // run CPU + rocFFT transform with the given params and compare template inline void fft_vs_reference_impl(Tparams& params, bool round_trip) { // Call hipGetLastError to reset any errors // returned by previous HIP runtime API calls. hipError_t hip_status = hipGetLastError(); // Make sure that the parameters make sense: ASSERT_TRUE(params.valid(verbose)); size_t needed_ram = needed_ram_buffers(params, verbose); if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) { GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb << ".\n"; } auto ibuffer_sizes = params.ibuffer_sizes(); auto obuffer_sizes = params.obuffer_sizes(); size_t vram_avail = 0; if(vramgb == 0) { // Check free and total available memory: size_t free = 0; size_t total = 0; auto hip_status = hipMemGetInfo(&free, &total); if(hip_status != hipSuccess || total == 0) { ++n_hip_failures; std::stringstream ss; if(total == 0) ss << "hipMemGetInfo claims there there isn't any vram"; else ss << "hipMemGetInfo failure with error " << hip_status; if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } vram_avail = total; } else { vram_avail = vramgb * ONE_GiB; } // First try a quick estimation of vram footprint, to speed up skipping tests // that are too large to fit in the gpu (no plan created with the rocFFT backend) const auto raw_vram_footprint = params.fft_params_vram_footprint() + 
twiddle_table_vram_footprint(params); if(!vram_fits_problem(raw_vram_footprint, vram_avail)) { GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint) << " GiB) raw data too large for device"; } if(verbose > 2) { std::cout << "Raw problem size: " << raw_vram_footprint << std::endl; } // If it passed the quick estimation test, go for the more // accurate calculation that actually creates the plan and // take into account the work buffer size const auto vram_footprint = params.vram_footprint(); if(!vram_fits_problem(vram_footprint, vram_avail)) { if(verbose) { std::cout << "Problem raw data won't fit on device; skipped." << std::endl; } GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint) << " GiB) raw data too large for device"; } // Create FFT plan - this will also allocate work buffer, but // will throw a specific exception if that step fails auto plan_status = fft_status_success; try { plan_status = params.create_plan(); } catch(fft_params::work_buffer_alloc_failure& e) { ++n_hip_failures; std::stringstream ss; ss << "Work buffer allocation failed with size: " << params.workbuffersize; if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed"; if(!vram_fits_problem(vram_footprint, vram_avail)) { if(verbose) { std::cout << "Problem won't fit on device; skipped." 
<< std::endl; } GTEST_SKIP() << "Problem size (" << vram_footprint << ") too large for device"; return; } fft_params contiguous_params; contiguous_params.length = params.length; contiguous_params.precision = params.precision; contiguous_params.placement = fft_placement_notinplace; contiguous_params.transform_type = params.transform_type; contiguous_params.nbatch = params.nbatch; contiguous_params.itype = contiguous_itype(params.transform_type); contiguous_params.otype = contiguous_otype(contiguous_params.transform_type); contiguous_params.validate(); if(!contiguous_params.valid(verbose)) { throw std::runtime_error("Invalid contiguous params"); } if(verbose > 3) { std::cout << "CPU params:\n"; std::cout << contiguous_params.str("\n\t") << std::endl; } std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { hip_status = ibuffer[i].alloc(ibuffer_sizes[i]); if(hip_status != hipSuccess) { std::stringstream ss; ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "(" << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)" << " with code " << hipError_to_string(hip_status); ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } pibuffer[i] = ibuffer[i].data(); } // allocation counts in elements, ibuffer_sizes is in bytes auto ibuffer_sizes_elems = ibuffer_sizes; for(auto& buf : ibuffer_sizes_elems) buf /= var_size(params.precision, params.itype); // Check cache first - nbatch is a >= comparison because we compute // the largest batch size and cache it. Smaller batch runs can // compare against the larger data. 
std::vector cpu_input; std::vector cpu_output; std::shared_future convert_cpu_output_precision; std::shared_future convert_cpu_input_precision; bool run_fftw = true; std::unique_ptr store_to_cache; if(fftw_compare && last_cpu_fft_data.length == params.length && last_cpu_fft_data.transform_type == params.transform_type && last_cpu_fft_data.run_callbacks == params.run_callbacks) { if(last_cpu_fft_data.nbatch >= params.nbatch) { // use the cached input/output cpu_input.swap(last_cpu_fft_data.cpu_input); cpu_output.swap(last_cpu_fft_data.cpu_output); run_fftw = false; store_to_cache = std::make_unique(cpu_input, cpu_output); if(params.precision != last_cpu_fft_data.precision) { // Tests should be ordered so we do wider first, then narrower. switch(params.precision) { case fft_precision_double: std::cerr << "test ordering is incorrect: double precision follows a narrower one" << std::endl; abort(); break; case fft_precision_single: if(last_cpu_fft_data.precision != fft_precision_double) { std::cerr << "test ordering is incorrect: float precision follows a narrower one" << std::endl; abort(); } // convert the input/output to single-precision convert_cpu_output_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_output.front()); }); convert_cpu_input_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_input.front()); }); break; case fft_precision_half: // convert to half precision if(last_cpu_fft_data.precision == fft_precision_double) { convert_cpu_output_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_output.front()); }); convert_cpu_input_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_input.front()); }); } else if(last_cpu_fft_data.precision == fft_precision_single) { convert_cpu_output_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_output.front()); }); convert_cpu_input_precision = std::async(std::launch::async, 
[&]() { narrow_precision_inplace(cpu_input.front()); }); } else { std::cerr << "unhandled previous precision, cannot convert to half" << std::endl; abort(); } break; } last_cpu_fft_data.precision = params.precision; } } // If the last result has a smaller batch than the new // params, that might be a developer error - tests should be // ordered to generate the bigger batch first. But if tests // got filtered or skipped due to insufficient memory, we // might never have tried to generate the bigger batch first. // So just fall through and redo the CPU FFT. } else { // Clear cache explicitly so that even if we didn't get a hit, // we're not uselessly holding on to cached cpu input/output last_cpu_fft_data = last_cpu_fft_cache(); } // Allocate CPU input if(run_fftw) { cpu_input = allocate_cpu_fft_buffer( contiguous_params.precision, contiguous_params.itype, contiguous_params.isize); } // Create FFTW plan - this may write to input, but that's fine // since there's nothing in there right now typename fftw_trait::fftw_plan_type cpu_plan = nullptr; if(run_fftw) { // Normally, we would want to defer allocation of CPU output // buffer until when we actually do the CPU FFT. But if we're // using FFTW wisdom, FFTW needs an output buffer at plan // creation time. if(use_fftw_wisdom) { cpu_output = allocate_cpu_fft_buffer( contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); } cpu_plan = fftw_plan_via_rocfft(contiguous_params.length, contiguous_params.istride, contiguous_params.ostride, contiguous_params.nbatch, contiguous_params.idist, contiguous_params.odist, contiguous_params.transform_type, cpu_input, cpu_output); needed_ram += needed_ram_fftw(contiguous_params, cpu_plan, verbose); if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) { if(verbose) { std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]." 
<< std::endl; } GTEST_SKIP(); return; } } std::vector gpu_input_data; // allocate and populate the input buffer (cpu/gpu) if(run_fftw) { gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); //generate the input directly on the gpu params.compute_input(ibuffer); // Copy the input to CPU if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) { // Copy input to CPU for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { hip_status = hipMemcpy(gpu_input_data.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; } else { GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; } } } copy_buffers(gpu_input_data, cpu_input, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, contiguous_params.itype, contiguous_params.istride, contiguous_params.idist, params.ioffset, contiguous_params.ioffset); } else { // Copy input to CPU for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { hip_status = hipMemcpy(cpu_input.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; } else { GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; } } } } } else if(fftw_compare) { gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); // In case the cached cpu input needed conversion, wait for it if(convert_cpu_input_precision.valid()) convert_cpu_input_precision.get(); // gets a pre-computed gpu input buffer from the cpu cache std::vector* gpu_input = &cpu_input; if(params.itype != contiguous_params.itype || 
params.istride != contiguous_params.istride || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) { copy_buffers(cpu_input, gpu_input_data, params.ilength(), params.nbatch, params.precision, contiguous_params.itype, contiguous_params.istride, contiguous_params.idist, params.itype, params.istride, params.idist, {0}, params.ioffset); gpu_input = &gpu_input_data; } // Copy input to GPU for(unsigned int idx = 0; idx < gpu_input->size(); ++idx) { hip_status = hipMemcpy(ibuffer[idx].data(), gpu_input->at(idx).data(), ibuffer_sizes[idx], hipMemcpyHostToDevice); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; } else { GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; } } } } if(verbose > 3) { std::cout << "CPU input:\n"; contiguous_params.print_ibuffer(cpu_input); } // compute input norm std::shared_future cpu_input_norm; if(fftw_compare) cpu_input_norm = std::async(std::launch::async, [&]() { // in case the cached cpu input needed conversion, wait for it if(convert_cpu_input_precision.valid()) convert_cpu_input_precision.get(); auto input_norm = norm(cpu_input, contiguous_params.ilength(), contiguous_params.nbatch, contiguous_params.precision, contiguous_params.itype, contiguous_params.istride, contiguous_params.idist, contiguous_params.ioffset); if(verbose > 2) { std::cout << "CPU Input Linf norm: " << input_norm.l_inf << "\n"; std::cout << "CPU Input L2 norm: " << input_norm.l_2 << "\n"; } return input_norm; }); std::vector obuffer_data; std::vector* obuffer = &obuffer_data; std::vector pobuffer; // allocate the output buffer if(params.placement == fft_placement_inplace) { obuffer = &ibuffer; } else { auto obuffer_sizes = params.obuffer_sizes(); obuffer_data.resize(obuffer_sizes.size()); for(unsigned int i = 0; i < obuffer_data.size(); ++i) { hip_status = obuffer_data[i].alloc(obuffer_sizes[i]); if(hip_status != hipSuccess) { 
++n_hip_failures; std::stringstream ss; ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i] << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)" << " with code " << hipError_to_string(hip_status); if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } // If we're validating output strides, init the // output buffer to a known pattern and we can check // that the pattern is untouched in places that // shouldn't have been touched. if(params.check_output_strides) { hip_status = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemset failure with error " << hip_status; } else { GTEST_FAIL() << "hipMemset failure with error " << hip_status; } } } } } pobuffer.resize(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } // Run CPU transform // // NOTE: This must happen after input is copied to GPU and input // norm is computed, since the CPU FFT may overwrite the input. 
VectorNorms cpu_output_norm; std::shared_future cpu_fft; if(fftw_compare) cpu_fft = std::async(std::launch::async, [&]() { // wait for input norm to finish, since we might overwrite input cpu_input_norm.get(); if(run_fftw) execute_cpu_fft(params, contiguous_params, cpu_plan, cpu_input, cpu_output); // in case the cached cpu output needed conversion, wait for it else if(convert_cpu_output_precision.valid()) convert_cpu_output_precision.get(); if(verbose > 3) { std::cout << "CPU output:\n"; contiguous_params.print_obuffer(cpu_output); } cpu_output_norm = norm(cpu_output, params.olength(), params.nbatch, params.precision, contiguous_params.otype, contiguous_params.ostride, contiguous_params.odist, contiguous_params.ooffset); if(verbose > 2) { std::cout << "CPU Output Linf norm: " << cpu_output_norm.l_inf << "\n"; std::cout << "CPU Output L2 norm: " << cpu_output_norm.l_2 << "\n"; } }); // scatter data out to multi-GPUs if this is a multi-GPU test params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer); // execute GPU transform std::vector gpu_output = allocate_host_buffer(params.precision, params.otype, params.osize); execute_gpu_fft(params, pibuffer, pobuffer, *obuffer, gpu_output); params.free(); if(params.check_output_strides) { check_output_strides(gpu_output, params); } // compute GPU output norm std::shared_future gpu_norm; if(fftw_compare) gpu_norm = std::async(std::launch::async, [&]() { return norm(gpu_output, params.olength(), params.nbatch, params.precision, params.otype, params.ostride, params.odist, params.ooffset); }); // compare output // // Compute the l-infinity and l-2 distance between the CPU and GPU output: // wait for cpu FFT so we can compute cutoff const auto total_length = std::accumulate(params.length.begin(), params.length.end(), static_cast(1), std::multiplies()); std::unique_ptr>> linf_failures; if(verbose > 1) linf_failures = std::make_unique>>(); double linf_cutoff; VectorNorms diff; std::shared_future compare_output; if(fftw_compare) 
compare_output = std::async(std::launch::async, [&]() { cpu_fft.get(); linf_cutoff = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length); diff = distance(cpu_output, gpu_output, params.olength(), params.nbatch, params.precision, contiguous_params.otype, contiguous_params.ostride, contiguous_params.odist, params.otype, params.ostride, params.odist, linf_failures.get(), linf_cutoff, {0}, params.ooffset); }); // Update the cache if this current transform is different from // what's stored. But if this transform only has a smaller batch // than what's cached, we can still keep the cache around since // the input/output we already have is still valid. const bool update_last_cpu_fft_data = last_cpu_fft_data.length != params.length || last_cpu_fft_data.transform_type != params.transform_type || last_cpu_fft_data.run_callbacks != params.run_callbacks || last_cpu_fft_data.precision != params.precision || params.nbatch > last_cpu_fft_data.nbatch; // store cpu output in cache if(update_last_cpu_fft_data) { last_cpu_fft_data.length = params.length; last_cpu_fft_data.nbatch = params.nbatch; last_cpu_fft_data.transform_type = params.transform_type; last_cpu_fft_data.run_callbacks = params.run_callbacks; last_cpu_fft_data.precision = params.precision; } if(compare_output.valid()) compare_output.get(); if(!store_to_cache) store_to_cache = std::make_unique(cpu_input, cpu_output); Tparams params_inverse; if(round_trip) { params_inverse.inverse_from_forward(params); run_round_trip_inverse( params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data); } if(fftw_compare) { ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2)); ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf)); ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2)); ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf)); if(verbose > 1) { std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; std::cout << "GPU linf norm 
failures:"; std::sort(linf_failures->begin(), linf_failures->end()); for(const auto& i : *linf_failures) { std::cout << " (" << i.first << "," << i.second << ")"; } std::cout << std::endl; } EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); } switch(params.precision) { case fft_precision_half: max_linf_eps_half = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); max_l2_eps_half = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); break; case fft_precision_single: max_linf_eps_single = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); max_l2_eps_single = std::max(max_l2_eps_single, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); break; case fft_precision_double: max_linf_eps_double = std::max(max_linf_eps_double, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); max_l2_eps_double = std::max(max_l2_eps_double, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); break; } if(verbose > 1) { std::cout << "L2 diff: " << diff.l_2 << "\n"; std::cout << "Linf diff: " << diff.l_inf << "\n"; } if(fftw_compare) { EXPECT_TRUE(diff.l_inf <= linf_cutoff) << "Linf test failed. Linf:" << diff.l_inf << "\tnormalized Linf: " << diff.l_inf / cpu_output_norm.l_inf << "\tcutoff: " << linf_cutoff << params.str(); EXPECT_TRUE(diff.l_2 / cpu_output_norm.l_2 < sqrt(log2(total_length)) * type_epsilon(params.precision)) << "L2 test failed. 
L2: " << diff.l_2 << "\tnormalized L2: " << diff.l_2 / cpu_output_norm.l_2 << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) << params.str(); } if(round_trip && fftw_compare) { compare_round_trip_inverse(params_inverse, contiguous_params, gpu_input_data, cpu_input, cpu_input_norm.get(), total_length); } } #endif upstream/shared/device_properties.h0000664000175000017500000000636614637252753016541 0ustar kaolkaol// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ROCFFT_DEVICE_PROPS_H #define ROCFFT_DEVICE_PROPS_H #include #include #include // get device properties static hipDeviceProp_t get_curr_device_prop() { hipDeviceProp_t prop; int deviceId = 0; if(hipGetDevice(&deviceId) != hipSuccess) throw std::runtime_error("hipGetDevice failed."); if(hipGetDeviceProperties(&prop, deviceId) != hipSuccess) throw std::runtime_error("hipGetDeviceProperties failed for deviceId " + std::to_string(deviceId)); return prop; } // check that the given grid/block dims will fit into the limits in // the device properties. throws std::runtime_error if the limits // are exceeded. static void launch_limits_check(const std::string& kernel_name, const dim3 gridDim, const dim3 blockDim, const hipDeviceProp_t& deviceProp) { // Need lots of casting here because dim3 is unsigned but device // props are signed. Cast direct comparisons to fix signedness // issues. Promote types to 64-bit when multiplying to try to // avoid overflow. // Block limits along each dimension if(blockDim.x > static_cast(deviceProp.maxThreadsDim[0]) || blockDim.y > static_cast(deviceProp.maxThreadsDim[1]) || blockDim.z > static_cast(deviceProp.maxThreadsDim[2])) throw std::runtime_error("max threads per dim exceeded: " + kernel_name); // Total threads for the whole block if(static_cast(blockDim.x) * blockDim.y * blockDim.z > static_cast(deviceProp.maxThreadsPerBlock)) throw std::runtime_error("max threads per block exceeded: " + kernel_name); // Grid dimension limits if(gridDim.x > static_cast(deviceProp.maxGridSize[0]) || gridDim.y > static_cast(deviceProp.maxGridSize[1]) || gridDim.z > static_cast(deviceProp.maxGridSize[2])) throw std::runtime_error("max grid size exceeded: " + kernel_name); } #endif upstream/shared/test_params.h0000664000175000017500000000336514637253000015326 0ustar kaolkaol// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#pragma once
#ifndef TESTCONSTANTS_H
#define TESTCONSTANTS_H

// <cstddef> provides size_t for the declarations below (the include
// target was lost in extraction and has been restored).
#include <cstddef>

// Globals shared by the test binaries; defined in the test driver.
extern int    verbose; // verbosity level for diagnostic output
extern size_t ramgb; // host RAM limit in GiB (0 = unlimited)
extern size_t vramgb; // device VRAM limit in GiB (0 = query device)

extern size_t n_random_tests;
extern size_t random_seed;
extern double planar_prob;
extern double callback_prob;

// per-precision comparison tolerances
extern double half_epsilon;
extern double single_epsilon;
extern double double_epsilon;

// when true, HIP runtime failures skip the test instead of failing it
extern bool skip_runtime_fails;

// worst-case observed normalized error, accumulated across tests
extern double max_linf_eps_double;
extern double max_l2_eps_double;
extern double max_linf_eps_single;
extern double max_l2_eps_single;
extern double max_linf_eps_half;
extern double max_l2_eps_half;

extern int n_hip_failures;

#endif

// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include "array_validator.h" #include "increment.h" // Check a 2D array for collisions. // The 2D case can be determined via a number-theoretic argument. bool valid_length_stride_2d(const size_t l0, const size_t l1, const size_t s0, const size_t s1) { if(s0 == s1) return false; const auto c = std::lcm(s0, s1); return !((s0 * (l0 - 1) >= c) && (s1 * (l1 - 1) >= c)); } // Compare a 1D direction with a multi-index hyperface for collisions. bool valid_length_stride_1d_multi(const unsigned int idx, const std::vector l, const std::vector s, const int verbose) { size_t l0{0}, s0{0}; std::vector l1{}, s1{}; for(unsigned int i = 0; i < l.size(); ++i) { if(i == idx) { l0 = l[i]; s0 = s[i]; } else { l1.push_back(l[i]); s1.push_back(s[i]); } } if(verbose > 4) { std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl; } // We only need to go to the maximum pointer offset for (l1,s1). 
const auto max_offset = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies()) - std ::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0); std::unordered_set a0{}; for(size_t i = 1; i < l0; ++i) { const auto val = i * s0; if(val <= max_offset) a0.insert(val); else break; } if(verbose > 5) { std::cout << "a0:"; for(auto i : a0) std::cout << " " << i; std::cout << std::endl; std::cout << "l1:"; for(auto i : l1) std::cout << " " << i; std::cout << std::endl; std::cout << "s1:"; for(auto i : s1) std::cout << " " << i; std::cout << std::endl; } // TODO: this can be multi-threaded, since find(...) is thread-safe. std::vector index(l1.size()); std::fill(index.begin(), index.end(), 0); do { const int i = std::inner_product(index.begin(), index.end(), s1.begin(), (size_t)0); if(i > 0 && (i % s0 == 0)) { // TODO: use an ordered set and binary search if(verbose > 6) std::cout << i << std::endl; if(a0.find(i) != a0.end()) { if(verbose > 4) { std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl; std::cout << "l1:"; for(const auto li : l1) std::cout << " " << li; std::cout << " s1:"; for(const auto si : s1) std::cout << " " << si; std::cout << std::endl; std::cout << "Found duplicate: " << i << std::endl; } return false; } } } while(increment_rowmajor(index, l1)); return true; } // Compare a hyperface with another hyperface for collisions. 
bool valid_length_stride_multi_multi(const std::vector l0, const std::vector s0, const std::vector l1, const std::vector s1) { std::unordered_set a0{}; const auto max_offset = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies()) - std::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0); std::vector index0(l0.size()); // TODO: check this std::fill(index0.begin(), index0.end(), 0); do { const auto i = std::inner_product(index0.begin(), index0.end(), s0.begin(), (size_t)0); if(i > max_offset) a0.insert(i); } while(increment_rowmajor(index0, l0)); std::vector index1(l1.size()); std::fill(index1.begin(), index1.end(), 0); do { const auto i = std::inner_product(index1.begin(), index1.end(), s1.begin(), (size_t)0); if(i > 0) { // TODO: use an ordered set and binary search if(a0.find(i) != a0.end()) { return false; } } } while(increment_rowmajor(index1, l1)); return true; } bool valid_length_stride_3d(const std::vector& l, const std::vector& s, const int verbose) { // Check that 2D faces are valid: if(!valid_length_stride_2d(l[0], l[1], s[0], s[1])) return false; if(!valid_length_stride_2d(l[0], l[2], s[0], s[2])) return false; if(!valid_length_stride_2d(l[1], l[2], s[1], s[2])) return false; // If the 2D faces are valid, check an axis vs a face for collisions: bool invalid = false; #ifdef _OPENMP #pragma omp parallel for #endif for(int idx = 0; idx < 3; ++idx) { if(!valid_length_stride_1d_multi(idx, l, s, verbose)) { #ifdef _OPENMP #pragma omp cancel for #endif invalid = true; } } if(invalid) return false; return true; } bool valid_length_stride_4d(const std::vector& l, const std::vector& s, const int verbose) { if(l.size() != 4) { throw std::runtime_error("Incorrect dimensions for valid_length_stride_4d"); } // Check that 2D faces are valid: for(int idx0 = 0; idx0 < 3; ++idx0) { for(int idx1 = idx0 + 1; idx1 < 4; ++idx1) { if(!valid_length_stride_2d(l[idx0], l[idx1], s[idx0], s[idx1])) return false; } } bool invalid = false; // Check that 1D vs 3D 
faces are valid: #ifdef _OPENMP #pragma omp parallel for #endif for(int idx0 = 0; idx0 < 4; ++idx0) { if(!valid_length_stride_1d_multi(idx0, l, s, verbose)) { #ifdef _OPENMP #pragma omp cancel for #endif invalid = true; } } if(invalid) return false; // Check that 2D vs 2D faces are valid: // First, get all the permutations std::vector> perms; std::vector v(l.size()); std::fill(v.begin(), v.begin() + 2, 0); std::fill(v.begin() + 2, v.end(), 1); do { perms.push_back(v); if(verbose > 3) { std::cout << "v:"; for(const auto i : v) { std::cout << " " << i; } std::cout << "\n"; } } while(std::next_permutation(v.begin(), v.end())); // Then loop over all of the permutations. #ifdef _OPENMP #pragma omp parallel for #endif for(size_t iperm = 0; iperm < perms.size(); ++iperm) { std::vector l0(2); std::vector s0(2); std::vector l1(2); std::vector s1(2); for(size_t i = 0; i < l.size(); ++i) { if(perms[iperm][i] == 0) { l0.push_back(l[i]); s0.push_back(s[i]); } else { l1.push_back(l[i]); s1.push_back(s[i]); } } if(verbose > 3) { std::cout << "\tl0:"; for(const auto i : l0) { std::cout << " " << i; } std::cout << "\n"; std::cout << "\ts0:"; for(const auto i : s0) { std::cout << " " << i; } std::cout << "\n"; std::cout << "\tl1:"; for(const auto i : l1) { std::cout << " " << i; } std::cout << "\n"; std::cout << "\ts1:"; for(const auto i : s1) { std::cout << " " << i; } std::cout << "\n"; } if(!valid_length_stride_multi_multi(l0, s0, l1, s1)) { #ifdef _OPENMP #pragma omp cancel for #endif invalid = true; } } if(invalid) return false; return true; } bool valid_length_stride_generald(const std::vector l, const std::vector s, const int verbose) { if(verbose > 2) { std::cout << "checking dimension " << l.size() << std::endl; } // Recurse on d-1 hyper-faces: for(unsigned int idx = 0; idx < l.size(); ++idx) { std::vector l0{}; std::vector s0{}; for(size_t i = 0; i < l.size(); ++i) { if(i != idx) { l0.push_back(l[i]); s0.push_back(s[i]); } } if(!array_valid(l0, s0, verbose)) return false; 
} // Handle the 1D vs (N-1) case: for(unsigned int idx = 0; idx < l.size(); ++idx) { if(!valid_length_stride_1d_multi(idx, l, s, verbose)) return false; } for(size_t dim0 = 2; dim0 <= l.size() / 2; ++dim0) { const size_t dim1 = l.size() - dim0; if(verbose > 2) std::cout << "dims: " << dim0 << " " << dim1 << std::endl; // We iterate over all permutations of an array of length l.size() which contains dim0 zeros // and dim1 ones. We start with {0, ..., 0, 1, ... 1} to guarantee that we hit all the // possibilities. // First, get all the permutations std::vector> perms; std::vector v(l.size()); std::fill(v.begin(), v.begin() + dim1, 0); std::fill(v.begin() + dim1, v.end(), 1); do { perms.push_back(v); if(verbose > 3) { std::cout << "v:"; for(const auto i : v) { std::cout << " " << i; } std::cout << "\n"; } } while(std::next_permutation(v.begin(), v.end())); bool invalid = false; // Then loop over all of the permutations. #ifdef _OPENMP #pragma omp parallel for #endif for(size_t iperm = 0; iperm < perms.size(); ++iperm) { std::vector l0(dim0); std::vector s0(dim0); std::vector l1(dim1); std::vector s1(dim1); for(size_t i = 0; i < l.size(); ++i) { if(v[i] == 0) { l0.push_back(l[i]); s0.push_back(s[i]); } else { l1.push_back(l[i]); s1.push_back(s[i]); } } if(verbose > 3) { std::cout << "\tl0:"; for(const auto i : l0) { std::cout << " " << i; } std::cout << "\n"; std::cout << "\ts0:"; for(const auto i : s0) { std::cout << " " << i; } std::cout << "\n"; std::cout << "\tl1:"; for(const auto i : l1) { std::cout << " " << i; } std::cout << "\n"; std::cout << "\ts1:"; for(const auto i : s1) { std::cout << " " << i; } std::cout << "\n"; } if(!valid_length_stride_multi_multi(l0, s0, l1, s1)) { #ifdef _OPENMP #pragma omp cancel for #endif invalid = true; } } if(invalid) return false; } return true; } bool sort_by_stride(const std::pair& ls0, const std::pair& ls1) { return ls0.second < ls1.second; } bool array_valid(const std::vector& length, const std::vector& stride, const int 
verbose) { if(length.size() != stride.size()) return false; // If a length is 1, then the stride is irrelevant. // If a length is > 1, then the corresponding stride must be > 1. std::vector l{}, s{}; for(unsigned int i = 0; i < length.size(); ++i) { if(length[i] > 1) { if(stride[i] == 0) return false; l.push_back(length[i]); s.push_back(stride[i]); } } if(length.size() > 1) { // Check happy path. bool happy_path = true; std::vector> ls; for(size_t idx = 0; idx < length.size(); ++idx) { ls.push_back(std::pair(length[idx], stride[idx])); } std::sort(ls.begin(), ls.end(), sort_by_stride); if(verbose > 2) { for(size_t idx = 0; idx < ls.size(); ++idx) { std::cout << ls[idx].first << "\t" << ls[idx].second << "\n"; } } for(size_t idx = 1; idx < ls.size(); ++idx) { if(ls[idx].second < ls[idx - 1].first * ls[idx - 1].second) { happy_path = false; break; } } if(happy_path) { if(verbose > 2) { std::cout << "happy path\n"; } return true; } } switch(l.size()) { case 0: return true; break; case 1: return s[0] != 0; break; case 2: { return valid_length_stride_2d(l[0], l[1], s[0], s[1]); break; } case 3: { return valid_length_stride_3d(l, s, verbose); break; } case 4: { return valid_length_stride_4d(l, s, verbose); break; } default: return valid_length_stride_generald(l, s, verbose); return true; } return true; } upstream/shared/concurrency.h0000664000175000017500000000324414637252753015350 0ustar kaolkaol// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #include #ifndef WIN32 #include #endif // work out how many parallel tasks to run, based on available // resources. on Linux, this will look at the cpu affinity mask (if // available) which might be restricted in a container. otherwise, // return std::thread::hardware_concurrency(). static unsigned int rocfft_concurrency() { #ifndef WIN32 cpu_set_t cpuset; if(sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0) return CPU_COUNT(&cpuset); #endif return std::thread::hardware_concurrency(); } upstream/shared/data_gen_device.h0000664000175000017500000014426514637253000016072 0ustar kaolkaol// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef DATA_GEN_DEVICE_H #define DATA_GEN_DEVICE_H // rocRAND can generate warnings if inline asm is not available for // some architectures. data generation isn't performance-critical, // so just disable inline asm to prevent the warnings. 
#define ROCRAND_DISABLE_INLINE_ASM #include "../shared/arithmetic.h" #include "../shared/device_properties.h" #include "../shared/gpubuf.h" #include "../shared/increment.h" #include "../shared/rocfft_complex.h" #include #include #include #include #include #include static const unsigned int DATA_GEN_THREADS = 8; static const unsigned int DATA_GEN_GRID_Y_MAX = 64; template struct input_val_1D { T val1; }; template struct input_val_2D { T val1; T val2; }; template struct input_val_3D { T val1; T val2; T val3; }; template static input_val_1D get_input_val(const T& val) { return input_val_1D{val}; } template static input_val_2D get_input_val(const std::tuple& val) { return input_val_2D{std::get<0>(val), std::get<1>(val)}; } template static input_val_3D get_input_val(const std::tuple& val) { return input_val_3D{std::get<0>(val), std::get<1>(val), std::get<2>(val)}; } template __device__ static size_t compute_index(const input_val_1D& length, const input_val_1D& stride, size_t base) { return (length.val1 * stride.val1) + base; } template __device__ static size_t compute_index(const input_val_2D& length, const input_val_2D& stride, size_t base) { return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base; } template __device__ static size_t compute_index(const input_val_3D& length, const input_val_3D& stride, size_t base) { return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3) + base; } template static inline input_val_1D make_zero_length(const input_val_1D& whole_length) { return input_val_1D{0}; } template static inline input_val_2D make_zero_length(const input_val_2D& whole_length) { return input_val_2D{0, 0}; } template static inline input_val_3D make_zero_length(const input_val_3D& whole_length) { return input_val_3D{0, 0, 0}; } template static inline input_val_1D make_unit_stride(const input_val_1D& whole_length) { return input_val_1D{1}; } template static inline input_val_2D make_unit_stride(const input_val_2D& 
whole_length) { return input_val_2D{1, whole_length.val1}; } template static inline input_val_3D make_unit_stride(const input_val_3D& whole_length) { return input_val_3D{1, whole_length.val1, whole_length.val1 * whole_length.val2}; } template __device__ static input_val_1D get_length(const size_t i, const input_val_1D& whole_length) { auto xlen = whole_length.val1; auto xidx = i % xlen; return input_val_1D{xidx}; } template __device__ static input_val_2D get_length(const size_t i, const input_val_2D& whole_length) { auto xlen = whole_length.val1; auto ylen = whole_length.val2; auto xidx = i % xlen; auto yidx = i / xlen % ylen; return input_val_2D{xidx, yidx}; } template __device__ static input_val_3D get_length(const size_t i, const input_val_3D& whole_length) { auto xlen = whole_length.val1; auto ylen = whole_length.val2; auto zlen = whole_length.val3; auto xidx = i % xlen; auto yidx = i / xlen % ylen; auto zidx = i / xlen / ylen % zlen; return input_val_3D{xidx, yidx, zidx}; } template __device__ static size_t get_batch(const size_t i, const input_val_1D& whole_length) { auto xlen = whole_length.val1; auto yidx = i / xlen; return yidx; } template __device__ static size_t get_batch(const size_t i, const input_val_2D& whole_length) { auto xlen = whole_length.val1; auto ylen = whole_length.val2; auto zidx = i / xlen / ylen; return zidx; } template __device__ static size_t get_batch(const size_t i, const input_val_3D& length) { auto xlen = length.val1; auto ylen = length.val2; auto zlen = length.val3; auto widx = i / xlen / ylen / zlen; return widx; } __device__ static double make_random_val(hiprandStatePhilox4_32_10* gen_state, double offset) { return hiprand_uniform_double(gen_state) + offset; } __device__ static float make_random_val(hiprandStatePhilox4_32_10* gen_state, float offset) { return hiprand_uniform(gen_state) + offset; } __device__ static _Float16 make_random_val(hiprandStatePhilox4_32_10* gen_state, _Float16 offset) { return 
static_cast<_Float16>(hiprand_uniform(gen_state)) + offset; } template __device__ static void set_imag_zero(const size_t pos, Tcomplex* x) { x[pos].y = 0.0; } template __device__ static void set_imag_zero(const size_t pos, Tfloat* xreal, Tfloat* ximag) { ximag[pos] = 0.0; } template __device__ static void conjugate(const size_t pos, const size_t cpos, Tcomplex* x) { x[pos].x = x[cpos].x; x[pos].y = -x[cpos].y; } template __device__ static void conjugate(const size_t pos, const size_t cpos, Tfloat* xreal, Tfloat* ximag) { xreal[pos] = xreal[cpos]; ximag[pos] = -ximag[cpos]; } template __global__ static void __launch_bounds__(DATA_GEN_THREADS) generate_random_interleaved_data_kernel(const Tint whole_length, const Tint zero_length, const size_t idist, const size_t isize, const Tint istride, rocfft_complex* data) { auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * DATA_GEN_THREADS; static_assert(sizeof(i) >= sizeof(isize)); if(i < isize) { auto i_length = get_length(i, whole_length); auto i_batch = get_batch(i, whole_length); auto i_base = i_batch * idist; auto seed = compute_index(zero_length, istride, i_base); auto idx = compute_index(i_length, istride, i_base); hiprandStatePhilox4_32_10 gen_state; hiprand_init(seed, idx, 0, &gen_state); data[idx].x = make_random_val(&gen_state, static_cast(-0.5)); data[idx].y = make_random_val(&gen_state, static_cast(-0.5)); } } template __global__ static void __launch_bounds__(DATA_GEN_THREADS) generate_interleaved_data_kernel(const Tint whole_length, const size_t idist, const size_t isize, const Tint istride, const Tint ustride, const Treal inv_scale, rocfft_complex* data) { auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * DATA_GEN_THREADS; static_assert(sizeof(i) >= sizeof(isize)); if(i < isize) { const auto i_length = get_length(i, whole_length); const auto i_batch = get_batch(i, whole_length); const auto i_base = i_batch * idist; const auto 
val = static_cast(-0.5) + static_cast( static_cast(compute_index(i_length, ustride, 0))) * inv_scale; const auto idx = compute_index(i_length, istride, i_base); data[idx].x = val; data[idx].y = val; } } template __global__ static void __launch_bounds__(DATA_GEN_THREADS) generate_random_planar_data_kernel(const Tint whole_length, const Tint zero_length, const size_t idist, const size_t isize, const Tint istride, Treal* real_data, Treal* imag_data) { auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * DATA_GEN_THREADS; static_assert(sizeof(i) >= sizeof(isize)); if(i < isize) { auto i_length = get_length(i, whole_length); auto i_batch = get_batch(i, whole_length); auto i_base = i_batch * idist; auto seed = compute_index(zero_length, istride, i_base); auto idx = compute_index(i_length, istride, i_base); hiprandStatePhilox4_32_10 gen_state; hiprand_init(seed, idx, 0, &gen_state); real_data[idx] = make_random_val(&gen_state, static_cast(-0.5)); imag_data[idx] = make_random_val(&gen_state, static_cast(-0.5)); } } template __global__ static void __launch_bounds__(DATA_GEN_THREADS) generate_planar_data_kernel(const Tint whole_length, const size_t idist, const size_t isize, const Tint istride, const Tint ustride, const Treal inv_scale, Treal* real_data, Treal* imag_data) { auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * DATA_GEN_THREADS; static_assert(sizeof(i) >= sizeof(isize)); if(i < isize) { const auto i_length = get_length(i, whole_length); const auto i_batch = get_batch(i, whole_length); const auto i_base = i_batch * idist; const auto val = static_cast(-0.5) + static_cast( static_cast(compute_index(i_length, ustride, 0))) * inv_scale; const auto idx = compute_index(i_length, istride, i_base); real_data[idx] = val; imag_data[idx] = val; } } template __global__ static void __launch_bounds__(DATA_GEN_THREADS) generate_random_real_data_kernel(const Tint whole_length, const Tint 
zero_length, const size_t idist, const size_t isize, const Tint istride, Treal* data) { auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * DATA_GEN_THREADS; static_assert(sizeof(i) >= sizeof(isize)); if(i < isize) { auto i_length = get_length(i, whole_length); auto i_batch = get_batch(i, whole_length); auto i_base = i_batch * idist; auto seed = compute_index(zero_length, istride, i_base); auto idx = compute_index(i_length, istride, i_base); hiprandStatePhilox4_32_10 gen_state; hiprand_init(seed, idx, 0, &gen_state); data[idx] = make_random_val(&gen_state, static_cast(-0.5)); } } template __global__ static void __launch_bounds__(DATA_GEN_THREADS) generate_real_data_kernel(const Tint whole_length, const size_t idist, const size_t isize, const Tint istride, const Tint ustride, const Treal inv_scale, Treal* data) { auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * DATA_GEN_THREADS; static_assert(sizeof(i) >= sizeof(isize)); if(i < isize) { const auto i_length = get_length(i, whole_length); const auto i_batch = get_batch(i, whole_length); const auto i_base = i_batch * idist; const auto val = static_cast(-0.5) + static_cast( static_cast(compute_index(i_length, ustride, 0))) * inv_scale; const auto idx = compute_index(i_length, istride, i_base); data[idx] = val; } } // For complex-to-real transforms, the input data must be Hermitiam-symmetric. // That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier // space. For multi-dimensional data, this means that we only need to store a bit more // than half of the complex values; the rest are redundant. However, there are still // some restrictions: // * the origin and Nyquist value(s) must be real-valued // * some of the remaining values are still redundant, and you might get different results // than you expect if the values don't agree. 
template __global__ static void impose_hermitian_symmetry_interleaved_1D_kernel(Tcomplex* x, const size_t Nx, const size_t xstride, const size_t dist, const size_t batch_total, const bool Nxeven) { auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; static_assert(sizeof(id_batch) == sizeof(size_t)); if(id_batch < batch_total) { id_batch *= dist; set_imag_zero(id_batch, x); if(Nxeven) set_imag_zero(id_batch + (Nx / 2) * xstride, x); } } template __global__ static void impose_hermitian_symmetry_planar_1D_kernel(Tfloat* xreal, Tfloat* ximag, const size_t Nx, const size_t xstride, const size_t dist, const size_t batch_total, const bool Nxeven) { auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; static_assert(sizeof(id_batch) == sizeof(size_t)); if(id_batch < batch_total) { id_batch *= dist; set_imag_zero(id_batch, xreal, ximag); if(Nxeven) set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); } } template __global__ static void impose_hermitian_symmetry_interleaved_2D_kernel(Tcomplex* x, const size_t Nx, const size_t Ny, const size_t xstride, const size_t ystride, const size_t dist, const size_t batch_total, const size_t x_total, const bool Nxeven, const bool Nyeven) { auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; static_assert(sizeof(id_batch) == sizeof(size_t)); static_assert(sizeof(id_x) == sizeof(size_t)); if(id_batch < batch_total) { id_batch *= dist; if(id_x == 0) set_imag_zero(id_batch, x); if(id_x == 0 && Nxeven) set_imag_zero(id_batch + (Nx / 2) * xstride, x); if(id_x == 0 && Nyeven) set_imag_zero(id_batch + ystride * (Ny / 2), x); if(id_x == 0 && Nxeven && Nyeven) set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); if(id_x < x_total) { conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x); if(Nyeven) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), id_batch + 
xstride * (id_x + 1) + ystride * (Ny / 2), x); } } } template __global__ static void impose_hermitian_symmetry_planar_2D_kernel(Tfloat* xreal, Tfloat* ximag, const size_t Nx, const size_t Ny, const size_t xstride, const size_t ystride, const size_t dist, const size_t batch_total, const size_t x_total, const bool Nxeven, const bool Nyeven) { auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; static_assert(sizeof(id_batch) == sizeof(size_t)); static_assert(sizeof(id_x) == sizeof(size_t)); if(id_batch < batch_total) { id_batch *= dist; if(id_x == 0) set_imag_zero(id_batch, xreal, ximag); if(id_x == 0 && Nxeven) set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); if(id_x == 0 && Nyeven) set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); if(id_x == 0 && Nxeven && Nyeven) set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); if(id_x < x_total) { conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), xreal, ximag); if(Nyeven) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), xreal, ximag); } } } template __global__ static void impose_hermitian_symmetry_interleaved_3D_kernel(Tcomplex* x, const size_t Nx, const size_t Ny, const size_t Nz, const size_t xstride, const size_t ystride, const size_t zstride, const size_t dist, const size_t batch_total, const size_t x_total, const size_t y_total, const size_t y_total_half, const bool Nxeven, const bool Nyeven, const bool Nzeven) { auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; const auto id_y = static_cast(threadIdx.z) + blockIdx.z * blockDim.z; static_assert(sizeof(id_batch) == sizeof(size_t)); static_assert(sizeof(id_x) == sizeof(size_t)); static_assert(sizeof(id_y) == sizeof(size_t)); if(id_batch < 
batch_total) { auto id_x_y_zero = (id_x == 0 && id_y == 0); id_batch *= dist; if(id_x_y_zero) set_imag_zero(id_batch, x); if(Nxeven && id_x_y_zero) set_imag_zero(id_batch + xstride * (Nx / 2), x); if(Nyeven && id_x_y_zero) set_imag_zero(id_batch + ystride * (Ny / 2), x); if(Nzeven && id_x_y_zero) set_imag_zero(id_batch + zstride * (Nz / 2), x); if(Nxeven && Nyeven && id_x_y_zero) set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); if(Nxeven && Nzeven && id_x_y_zero) set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), x); if(Nyeven && Nzeven && id_x_y_zero) set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), x); if(Nxeven && Nyeven && Nzeven && id_x_y_zero) set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), x); if(id_x == 0 && id_y < y_total_half) conjugate(id_batch + ystride * (Ny - (id_y + 1)), id_batch + ystride * (id_y + 1), x); if(Nxeven && id_x == 0 && id_y < y_total_half) conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)), id_batch + xstride * (Nx / 2) + ystride * (id_y + 1), x); if(id_x < x_total && id_y == 0) conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x); if(Nyeven && id_x < x_total && id_y == 0) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), x); if(id_x < x_total && id_y < y_total) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)), id_batch + xstride * (id_x + 1) + ystride * (id_y + 1), x); if(Nzeven) { if(id_x < x_total && id_y == 0) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), x); if(Nyeven && id_x < x_total && id_y == 0) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), x); if(id_x == 0 && id_y < y_total_half) conjugate(id_batch + ystride * (Ny - (id_y + 1)) 
+ zstride * (Nz / 2), id_batch + ystride * (id_y + 1) + zstride * (Nz / 2), x); if(Nxeven && id_x == 0 && id_y < y_total_half) conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2), x); if(id_x < x_total && id_y < y_total) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), id_batch + xstride * (id_x + 1) + ystride * (id_y + 1) + zstride * (Nz / 2), x); } } } template __global__ static void impose_hermitian_symmetry_planar_3D_kernel(Tfloat* xreal, Tfloat* ximag, const size_t Nx, const size_t Ny, const size_t Nz, const size_t xstride, const size_t ystride, const size_t zstride, const size_t dist, const size_t batch_total, const size_t x_total, const size_t y_total, const size_t y_total_half, const bool Nxeven, const bool Nyeven, const bool Nzeven) { auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; const auto id_y = static_cast(threadIdx.z) + blockIdx.z * blockDim.z; static_assert(sizeof(id_batch) == sizeof(size_t)); static_assert(sizeof(id_x) == sizeof(size_t)); static_assert(sizeof(id_y) == sizeof(size_t)); if(id_batch < batch_total) { auto id_x_y_zero = (id_x == 0 && id_y == 0); id_batch *= dist; if(id_x_y_zero) set_imag_zero(id_batch, xreal, ximag); if(Nxeven && id_x_y_zero) set_imag_zero(id_batch + xstride * (Nx / 2), xreal, ximag); if(Nyeven && id_x_y_zero) set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); if(Nzeven && id_x_y_zero) set_imag_zero(id_batch + zstride * (Nz / 2), xreal, ximag); if(Nxeven && Nyeven && id_x_y_zero) set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); if(Nxeven && Nzeven && id_x_y_zero) set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), xreal, ximag); if(Nyeven && Nzeven && id_x_y_zero) set_imag_zero(id_batch + ystride * (Ny / 2) + 
zstride * (Nz / 2), xreal, ximag); if(Nxeven && Nyeven && Nzeven && id_x_y_zero) set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), xreal, ximag); if(id_x == 0 && id_y < y_total_half) conjugate(id_batch + ystride * (Ny - (id_y + 1)), id_batch + ystride * (id_y + 1), xreal, ximag); if(Nxeven && id_x == 0 && id_y < y_total_half) conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)), id_batch + xstride * (Nx / 2) + ystride * (id_y + 1), xreal, ximag); if(id_x < x_total && id_y == 0) conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), xreal, ximag); if(Nyeven && id_x < x_total && id_y == 0) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), xreal, ximag); if(id_x < x_total && id_y < y_total) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)), id_batch + xstride * (id_x + 1) + ystride * (id_y + 1), xreal, ximag); if(Nzeven) { if(id_x < x_total && id_y == 0) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), xreal, ximag); if(Nyeven && id_x < x_total && id_y == 0) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), xreal, ximag); if(id_x == 0 && id_y < y_total_half) conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), id_batch + ystride * (id_y + 1) + zstride * (Nz / 2), xreal, ximag); if(Nxeven && id_x == 0 && id_y < y_total_half) conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2), xreal, ximag); if(id_x < x_total && id_y < y_total) conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), id_batch + xstride * (id_x + 1) + ystride * (id_y + 1) + zstride * (Nz / 2), xreal, 
ximag); } } } // get grid dimensions for data gen kernel static dim3 generate_data_gridDim(const size_t isize) { auto blockSize = DATA_GEN_THREADS; // total number of blocks needed in the grid auto numBlocks_setup = DivRoundingUp(isize, blockSize); // Total work items per dimension in the grid is counted in // uint32_t. Since each thread initializes one element, very // large amounts of data will overflow this total size if we do // all this work in one grid dimension, causing launch failure. // // CUDA also generally allows for effectively unlimited grid X // dim, but Y and Z are more limited. auto gridDim_y = std::min(DATA_GEN_GRID_Y_MAX, numBlocks_setup); auto gridDim_x = DivRoundingUp(numBlocks_setup, DATA_GEN_GRID_Y_MAX); return {gridDim_x, gridDim_y}; } // get grid dimensions for hermitian symmetrizer kernel static dim3 generate_hermitian_gridDim(const std::vector& length, const size_t batch, const size_t blockSize) { dim3 gridDim; switch(length.size()) { case 1: gridDim = dim3(DivRoundingUp(batch, blockSize)); break; case 2: gridDim = dim3(DivRoundingUp(batch, blockSize), DivRoundingUp((length[0] + 1) / 2 - 1, blockSize)); break; case 3: gridDim = dim3(DivRoundingUp(batch, blockSize), DivRoundingUp((length[0] + 1) / 2 - 1, blockSize), DivRoundingUp(length[1] - 1, blockSize)); break; default: throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); } return gridDim; } static dim3 generate_blockDim(const std::vector& length, const size_t blockSize) { dim3 blockDim; switch(length.size()) { case 1: blockDim = dim3(blockSize); break; case 2: blockDim = dim3(blockSize, blockSize); break; case 3: blockDim = dim3(blockSize, blockSize, blockSize); break; default: throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); } return blockDim; } template static void generate_random_interleaved_data(const Tint& whole_length, const size_t idist, const size_t isize, const Tint& whole_stride, rocfft_complex* input_data, const 
hipDeviceProp_t& deviceProp) { auto input_length = get_input_val(whole_length); auto zero_length = make_zero_length(input_length); auto input_stride = get_input_val(whole_stride); dim3 gridDim = generate_data_gridDim(isize); dim3 blockDim{DATA_GEN_THREADS}; launch_limits_check("generate_random_interleaved_data_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL( HIP_KERNEL_NAME(generate_random_interleaved_data_kernel), gridDim, blockDim, 0, // sharedMemBytes 0, // stream input_length, zero_length, idist, isize, input_stride, input_data); auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("generate_random_interleaved_data_kernel launch failure: " + std::string(hipGetErrorName(err))); } template static void generate_interleaved_data(const Tint& whole_length, const size_t idist, const size_t isize, const Tint& whole_stride, const size_t nbatch, rocfft_complex* input_data, const hipDeviceProp_t& deviceProp) { const auto input_length = get_input_val(whole_length); const auto input_stride = get_input_val(whole_stride); const auto unit_stride = make_unit_stride(input_length); const auto inv_scale = static_cast(1.0) / static_cast(static_cast(isize) / nbatch - 1); dim3 gridDim = generate_data_gridDim(isize); dim3 blockDim{DATA_GEN_THREADS}; launch_limits_check("generate_interleaved_data_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL( HIP_KERNEL_NAME(generate_interleaved_data_kernel), gridDim, blockDim, 0, // sharedMemBytes 0, // stream input_length, idist, isize, input_stride, unit_stride, inv_scale, input_data); auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("generate_interleaved_data_kernel launch failure: " + std::string(hipGetErrorName(err))); } template static void generate_random_planar_data(const Tint& whole_length, const size_t idist, const size_t isize, const Tint& whole_stride, Treal* real_data, Treal* imag_data, const hipDeviceProp_t& deviceProp) { const auto input_length = 
get_input_val(whole_length); const auto zero_length = make_zero_length(input_length); const auto input_stride = get_input_val(whole_stride); dim3 gridDim = generate_data_gridDim(isize); dim3 blockDim{DATA_GEN_THREADS}; launch_limits_check("generate_random_planar_data_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL( HIP_KERNEL_NAME(generate_random_planar_data_kernel), gridDim, blockDim, 0, // sharedMemBytes 0, // stream input_length, zero_length, idist, isize, input_stride, real_data, imag_data); auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("generate_random_planar_data_kernel launch failure: " + std::string(hipGetErrorName(err))); } template static void generate_planar_data(const Tint& whole_length, const size_t idist, const size_t isize, const Tint& whole_stride, const size_t nbatch, Treal* real_data, Treal* imag_data, const hipDeviceProp_t& deviceProp) { const auto input_length = get_input_val(whole_length); const auto input_stride = get_input_val(whole_stride); const auto unit_stride = make_unit_stride(input_length); const auto inv_scale = static_cast(1.0) / static_cast(static_cast(isize) / nbatch - 1); dim3 gridDim = generate_data_gridDim(isize); dim3 blockDim{DATA_GEN_THREADS}; launch_limits_check("generate_planar_data_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_planar_data_kernel), gridDim, blockDim, 0, // sharedMemBytes 0, // stream input_length, idist, isize, input_stride, unit_stride, inv_scale, real_data, imag_data); auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("generate_planar_data_kernel launch failure: " + std::string(hipGetErrorName(err))); } template static void generate_random_real_data(const Tint& whole_length, const size_t idist, const size_t isize, const Tint& whole_stride, Treal* input_data, const hipDeviceProp_t& deviceProp) { const auto input_length = get_input_val(whole_length); const auto zero_length = 
make_zero_length(input_length); const auto input_stride = get_input_val(whole_stride); dim3 gridDim = generate_data_gridDim(isize); dim3 blockDim{DATA_GEN_THREADS}; launch_limits_check("generate_random_real_data_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL( HIP_KERNEL_NAME(generate_random_real_data_kernel), gridDim, blockDim, 0, // sharedMemBytes 0, // stream input_length, zero_length, idist, isize, input_stride, input_data); auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("generate_random_real_data_kernel launch failure: " + std::string(hipGetErrorName(err))); } template static void generate_real_data(const Tint& whole_length, const size_t idist, const size_t isize, const Tint& whole_stride, const size_t nbatch, Treal* input_data, const hipDeviceProp_t& deviceProp) { const auto input_length = get_input_val(whole_length); const auto input_stride = get_input_val(whole_stride); const auto unit_stride = make_unit_stride(input_length); const auto inv_scale = static_cast(1.0) / static_cast(static_cast(isize) / nbatch - 1); dim3 gridDim = generate_data_gridDim(isize); dim3 blockDim{DATA_GEN_THREADS}; launch_limits_check("generate_real_data_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_real_data_kernel), gridDim, blockDim, 0, // sharedMemBytes 0, // stream input_length, idist, isize, input_stride, unit_stride, inv_scale, input_data); auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("generate_real_data_kernel launch failure: " + std::string(hipGetErrorName(err))); } template static void impose_hermitian_symmetry_interleaved(const std::vector& length, const std::vector& ilength, const std::vector& stride, const size_t dist, const size_t batch, Tcomplex* input_data, const hipDeviceProp_t& deviceProp) { auto blockSize = DATA_GEN_THREADS; auto blockDim = generate_blockDim(length, blockSize); auto gridDim = generate_hermitian_gridDim(length, batch, blockSize); 
switch(length.size()) { case 1: { launch_limits_check( "impose_hermitian_symmetry_interleaved_1D_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel, gridDim, blockDim, 0, 0, input_data, length[0], stride[0], dist, batch, length[0] % 2 == 0); break; } case 2: { launch_limits_check( "impose_hermitian_symmetry_interleaved_2D_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel, gridDim, blockDim, 0, 0, input_data, length[0], length[1], stride[0], stride[1], dist, batch, (ilength[0] + 1) / 2 - 1, length[0] % 2 == 0, length[1] % 2 == 0); break; } case 3: { launch_limits_check( "impose_hermitian_symmetry_interleaved_3D_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel, gridDim, blockDim, 0, 0, input_data, length[0], length[1], length[2], stride[0], stride[1], stride[2], dist, batch, (ilength[0] + 1) / 2 - 1, ilength[1] - 1, (ilength[1] + 1) / 2 - 1, length[0] % 2 == 0, length[1] % 2 == 0, length[2] % 2 == 0); break; } default: throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("impose_hermitian_symmetry_interleaved_kernel launch failure: " + std::string(hipGetErrorName(err))); } template static void impose_hermitian_symmetry_planar(const std::vector& length, const std::vector& ilength, const std::vector& stride, const size_t dist, const size_t batch, Tfloat* input_data_real, Tfloat* input_data_imag, const hipDeviceProp_t& deviceProp) { auto blockSize = DATA_GEN_THREADS; auto blockDim = generate_blockDim(length, blockSize); auto gridDim = generate_hermitian_gridDim(length, batch, blockSize); switch(length.size()) { case 1: { launch_limits_check( "impose_hermitian_symmetry_planar_1D_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1D_kernel, gridDim, blockDim, 
0, 0, input_data_real, input_data_imag, length[0], stride[0], dist, batch, length[0] % 2 == 0); break; } case 2: { launch_limits_check( "impose_hermitian_symmetry_planar_2D_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2D_kernel, gridDim, blockDim, 0, 0, input_data_real, input_data_imag, length[0], length[1], stride[0], stride[1], dist, batch, (ilength[0] + 1) / 2 - 1, length[0] % 2 == 0, length[1] % 2 == 0); break; } case 3: { launch_limits_check( "impose_hermitian_symmetry_planar_3D_kernel", gridDim, blockDim, deviceProp); hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3D_kernel, gridDim, blockDim, 0, 0, input_data_real, input_data_imag, length[0], length[1], length[2], stride[0], stride[1], stride[2], dist, batch, (ilength[0] + 1) / 2 - 1, ilength[1] - 1, (ilength[1] + 1) / 2 - 1, length[0] % 2 == 0, length[1] % 2 == 0, length[2] % 2 == 0); break; } default: throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("impose_hermitian_symmetry_planar_kernel launch failure: " + std::string(hipGetErrorName(err))); } #endif // DATA_GEN_DEVICE_H upstream/shared/array_validator.h0000664000175000017500000000270414637252753016201 0ustar kaolkaol// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ARRAY_VALIDATOR_H #define ARRAY_VALIDATOR_H #include // Checks whether the array with given length and stride has multi-index collisions. bool array_valid(const std::vector& length, const std::vector& stride, const int verbose = 0); #endif upstream/shared/data_gen_host.h0000664000175000017500000010370214637253000015577 0ustar kaolkaol// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef DATA_GEN_HOST_H #define DATA_GEN_HOST_H #include "../shared/hostbuf.h" #include "../shared/increment.h" #include #include #include #include #include // Specialized computation of index given 1-, 2-, 3- dimension length + stride template size_t compute_index(T1 length, T2 stride, size_t base) { return (length * stride) + base; } template size_t compute_index(const std::tuple& length, const std::tuple& stride, size_t base) { static_assert(std::is_integral::value, "Integral required."); static_assert(std::is_integral::value, "Integral required."); return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) + base; } template size_t compute_index(const std::tuple& length, const std::tuple& stride, size_t base) { static_assert(std::is_integral::value, "Integral required."); static_assert(std::is_integral::value, "Integral required."); return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) + (std::get<2>(length) * std::get<2>(stride)) + base; } // count the number of total iterations for 1-, 2-, and 3-D dimensions template size_t count_iters(const T1& i) { return i; } template size_t count_iters(const std::tuple& i) { return std::get<0>(i) * std::get<1>(i); } template size_t count_iters(const std::tuple& i) { return std::get<0>(i) * std::get<1>(i) * std::get<2>(i); } template T1 make_unit_stride(const T1& whole_length) { return static_cast(1); } template std::tuple make_unit_stride(const std::tuple& whole_length) { return std::make_tuple(static_cast(1), static_cast(std::get<0>(whole_length))); } template std::tuple make_unit_stride(const std::tuple& whole_length) { return std::make_tuple(static_cast(1), static_cast(std::get<0>(whole_length)), static_cast(std::get<0>(whole_length)) * static_cast(std::get<1>(whole_length))); } // Work out how many partitions to break our iteration problem into template static size_t compute_partition_count(T1 length) { #ifdef _OPENMP // we seem to get 
contention from too many threads, which slows // things down. particularly noticeable with mix_3D tests static const size_t MAX_PARTITIONS = 8; size_t iters = count_iters(length); size_t hw_threads = std::min(MAX_PARTITIONS, static_cast(omp_get_num_procs())); if(!hw_threads) return 1; // don't bother threading problem sizes that are too small. pick // an arbitrary number of iterations and ensure that each thread // has at least that many iterations to process static const size_t MIN_ITERS_PER_THREAD = 2048; // either use the whole CPU, or use ceil(iters/iters_per_thread) return std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD + 1) / MIN_ITERS_PER_THREAD); #else return 1; #endif } // Break a scalar length into some number of pieces, returning // [(start0, end0), (start1, end1), ...] template std::vector> partition_base(const T1& length, size_t num_parts) { static_assert(std::is_integral::value, "Integral required."); // make sure we don't exceed the length num_parts = std::min(length, num_parts); std::vector> ret(num_parts); auto partition_size = length / num_parts; T1 cur_partition = 0; for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size) { ret[i].first = cur_partition; ret[i].second = cur_partition + partition_size; } // last partition might not divide evenly, fix it up ret.back().second = length; return ret; } // Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths template std::vector> partition_rowmajor(const T1& length) { return partition_base(length, compute_partition_count(length)); } // Partition on the leftmost part of the tuple, for row-major indexing template std::vector, std::tuple>> partition_rowmajor(const std::tuple& length) { auto partitions = partition_base(std::get<0>(length), compute_partition_count(length)); std::vector, std::tuple>> ret(partitions.size()); for(size_t i = 0; i < partitions.size(); ++i) { std::get<0>(ret[i].first) = partitions[i].first; std::get<1>(ret[i].first) = 0; std::get<0>(ret[i].second) = 
partitions[i].second; std::get<1>(ret[i].second) = std::get<1>(length); } return ret; } template std::vector, std::tuple>> partition_rowmajor(const std::tuple& length) { auto partitions = partition_base(std::get<0>(length), compute_partition_count(length)); std::vector, std::tuple>> ret(partitions.size()); for(size_t i = 0; i < partitions.size(); ++i) { std::get<0>(ret[i].first) = partitions[i].first; std::get<1>(ret[i].first) = 0; std::get<2>(ret[i].first) = 0; std::get<0>(ret[i].second) = partitions[i].second; std::get<1>(ret[i].second) = std::get<1>(length); std::get<2>(ret[i].second) = std::get<2>(length); } return ret; } // For complex-to-real transforms, the input data must be Hermitiam-symmetric. // That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier // space. For multi-dimensional data, this means that we only need to store a bit more // than half of the complex values; the rest are redundant. However, there are still // some restrictions: // * the origin and Nyquist value(s) must be real-valued // * some of the remaining values are still redundant, and you might get different results // than you expect if the values don't agree. // Below are some example kernels which impose Hermitian symmetry on a complex array // of the given dimensions. 
template static void impose_hermitian_symmetry_interleaved_1D(std::vector& vals, const std::vector& length, const std::vector& istride, const Tsize idist, const Tsize nbatch) { for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) { auto data = ((std::complex*)vals[0].data()) + ibatch * idist; data[0].imag(0.0); if(length[0] % 2 == 0) { data[istride[0] * (length[0] / 2)].imag(0.0); } } } template static void impose_hermitian_symmetry_planar_1D(std::vector& vals, const std::vector& length, const std::vector& istride, const Tsize idist, const Tsize nbatch) { for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) { auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; data_imag[0] = 0.0; if(length[0] % 2 == 0) { data_imag[istride[0] * (length[0] / 2)] = 0.0; } } } template static void impose_hermitian_symmetry_interleaved_2D(std::vector& vals, const std::vector& length, const std::vector& istride, const Tsize idist, const Tsize nbatch) { for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) { auto data = ((std::complex*)vals[0].data()) + ibatch * idist; data[0].imag(0.0); if(length[0] % 2 == 0) { data[istride[0] * (length[0] / 2)].imag(0.0); } if(length[1] % 2 == 0) { data[istride[1] * (length[1] / 2)].imag(0.0); } if(length[0] % 2 == 0 && length[1] % 2 == 0) { data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0); } for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]); } if(length[1] % 2 == 0) { for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]); } } } } template static void impose_hermitian_symmetry_planar_2D(std::vector& vals, const std::vector& length, const std::vector& istride, const Tsize idist, const Tsize nbatch) { for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) { auto data_real = ((Tfloat*)vals[0].data()) + 
ibatch * idist; auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; data_imag[0] = 0.0; if(length[0] % 2 == 0) { data_imag[istride[0] * (length[0] / 2)] = 0.0; } if(length[1] % 2 == 0) { data_imag[istride[1] * (length[1] / 2)] = 0.0; } if(length[0] % 2 == 0 && length[1] % 2 == 0) { data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0; } for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i]; data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i]; } if(length[1] % 2 == 0) { for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] = data_real[istride[0] * i + istride[1] * (length[1] / 2)]; data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)]; } } } } template static void impose_hermitian_symmetry_interleaved_3D(std::vector& vals, const std::vector& length, const std::vector& istride, const Tsize idist, const Tsize nbatch) { for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) { auto data = ((std::complex*)vals[0].data()) + ibatch * idist; data[0].imag(0.0); if(length[0] % 2 == 0) { data[istride[0] * (length[0] / 2)].imag(0.0); } if(length[1] % 2 == 0) { data[istride[1] * (length[1] / 2)].imag(0.0); } if(length[2] % 2 == 0) { data[istride[2] * (length[2] / 2)].imag(0.0); } if(length[0] % 2 == 0 && length[1] % 2 == 0) { data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0); } if(length[0] % 2 == 0 && length[2] % 2 == 0) { data[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)].imag(0.0); } if(length[1] % 2 == 0 && length[2] % 2 == 0) { data[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)].imag(0.0); } if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0) { data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)] 
.imag(0.0); } // y-axis: for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) { data[istride[1] * (length[1] - j)] = std::conj(data[istride[1] * j]); } if(length[0] % 2 == 0) { // y-axis at x-nyquist for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) { data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j]); } } // x-axis: for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]); } if(length[1] % 2 == 0) { // x-axis at y-nyquist for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]); } } // x-y plane: for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { for(unsigned int j = 1; j < length[1]; ++j) { data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] = std::conj(data[istride[0] * i + istride[1] * j]); } } if(length[2] % 2 == 0) { // x-axis at z-nyquist for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]); } if(length[1] % 2 == 0) { // x-axis at yz-nyquist for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]); } } // y-axis: at z-nyquist for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) { data[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] = std::conj(data[istride[1] * j + istride[2] * (length[2] / 2)]); } if(length[0] % 2 == 0) { // y-axis: at xz-nyquist for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) { data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j + istride[2] * (length[2] / 
2)]); } } // x-y plane: at z-nyquist for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { for(unsigned int j = 1; j < length[1]; ++j) { data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] = std::conj( data[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]); } } } } } template static void impose_hermitian_symmetry_planar_3D(std::vector& vals, const std::vector& length, const std::vector& istride, const Tsize idist, const Tsize nbatch) { for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) { auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist; auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; data_imag[0] = 0.0; if(length[0] % 2 == 0) { data_imag[istride[0] * (length[0] / 2)] = 0.0; } if(length[1] % 2 == 0) { data_imag[istride[1] * (length[1] / 2)] = 0.0; } if(length[2] % 2 == 0) { data_imag[istride[2] * (length[2] / 2)] = 0.0; } if(length[0] % 2 == 0 && length[1] % 2 == 0) { data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0; } if(length[0] % 2 == 0 && length[2] % 2 == 0) { data_imag[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)] = 0.0; } if(length[1] % 2 == 0 && length[2] % 2 == 0) { data_imag[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)] = 0.0; } if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0) { data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)] = 0.0; } // y-axis: for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) { data_real[istride[1] * (length[1] - j)] = data_real[istride[1] * j]; data_imag[istride[1] * (length[1] - j)] = -data_imag[istride[1] * j]; } if(length[0] % 2 == 0) { // y-axis at x-nyquist for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) { data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] = data_real[istride[0] * (length[0] / 2) + istride[1] * j]; data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] 
= -data_imag[istride[0] * (length[0] / 2) + istride[1] * j]; } } // x-axis: for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i]; data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i]; } if(length[1] % 2 == 0) { // x-axis at y-nyquist for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] = data_real[istride[0] * i + istride[1] * (length[1] / 2)]; data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)]; } } // x-y plane: for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { for(unsigned int j = 1; j < length[1]; ++j) { data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] = data_real[istride[0] * i + istride[1] * j]; data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] = -data_imag[istride[0] * i + istride[1] * j]; } } if(length[2] % 2 == 0) { // x-axis at z-nyquist for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] = data_real[istride[0] * i + istride[2] * (length[2] / 2)]; data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)]; } if(length[1] % 2 == 0) { // x-axis at yz-nyquist for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] = data_real[istride[0] * i + istride[2] * (length[2] / 2)]; data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)]; } } // y-axis: at z-nyquist for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) { data_real[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] = data_real[istride[1] * j + istride[2] * (length[2] / 2)]; data_imag[istride[1] * (length[1] - j) + 
istride[2] * (length[2] / 2)] = -data_imag[istride[1] * j + istride[2] * (length[2] / 2)]; } if(length[0] % 2 == 0) { // y-axis: at xz-nyquist for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) { data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] = data_real[istride[0] * (length[0] / 2) + istride[1] * j + istride[2] * (length[2] / 2)]; data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j + istride[2] * (length[2] / 2)]; } } // x-y plane: at z-nyquist for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) { for(unsigned int j = 1; j < length[1]; ++j) { data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] = data_real[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]; data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] = -data_imag[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]; } } } } } template static void generate_random_interleaved_data(std::vector& input, const Tint1& whole_length, const Tint1& whole_stride, const size_t idist, const size_t nbatch) { auto idata = (std::complex*)input[0].data(); size_t i_base = 0; auto partitions = partition_rowmajor(whole_length); for(unsigned int b = 0; b < nbatch; b++, i_base += idist) { #pragma omp parallel for num_threads(partitions.size()) for(size_t part = 0; part < partitions.size(); ++part) { auto index = partitions[part].first; const auto length = partitions[part].second; std::mt19937 gen(compute_index(index, whole_stride, i_base)); do { const auto i = compute_index(index, whole_stride, i_base); const Tfloat x = (Tfloat)gen() / (Tfloat)gen.max(); const Tfloat y = (Tfloat)gen() / (Tfloat)gen.max(); const std::complex val(x, y); idata[i] = val; } while(increment_rowmajor(index, length)); } } } template static void 
generate_interleaved_data(std::vector& input, const Tint1& whole_length, const Tint1& whole_stride, const size_t idist, const size_t nbatch) { auto idata = (std::complex*)input[0].data(); size_t i_base = 0; auto partitions = partition_rowmajor(whole_length); auto unit_stride = make_unit_stride(whole_length); const Tfloat inv_scale = 1.0 / static_cast(count_iters(whole_length) - 1); for(unsigned int b = 0; b < nbatch; b++, i_base += idist) { #pragma omp parallel for num_threads(partitions.size()) for(size_t part = 0; part < partitions.size(); ++part) { auto index = partitions[part].first; const auto length = partitions[part].second; do { const auto val_xy = -0.5 + static_cast(compute_index(index, unit_stride, 0)) * inv_scale; const std::complex val(val_xy, val_xy); const auto i = compute_index(index, whole_stride, i_base); idata[i] = val; } while(increment_rowmajor(index, length)); } } } template static void generate_random_planar_data(std::vector& input, const Tint1& whole_length, const Tint1& whole_stride, const size_t idist, const size_t nbatch) { auto ireal = (Tfloat*)input[0].data(); auto iimag = (Tfloat*)input[1].data(); size_t i_base = 0; auto partitions = partition_rowmajor(whole_length); for(unsigned int b = 0; b < nbatch; b++, i_base += idist) { #pragma omp parallel for num_threads(partitions.size()) for(size_t part = 0; part < partitions.size(); ++part) { auto index = partitions[part].first; const auto length = partitions[part].second; std::mt19937 gen(compute_index(index, whole_stride, i_base)); do { const auto i = compute_index(index, whole_stride, i_base); const std::complex val((Tfloat)gen() / (Tfloat)gen.max(), (Tfloat)gen() / (Tfloat)gen.max()); ireal[i] = val.real(); iimag[i] = val.imag(); } while(increment_rowmajor(index, length)); } } } template static void generate_planar_data(std::vector& input, const Tint1& whole_length, const Tint1& whole_stride, const size_t idist, const size_t nbatch) { auto ireal = (Tfloat*)input[0].data(); auto iimag = 
(Tfloat*)input[1].data(); size_t i_base = 0; auto partitions = partition_rowmajor(whole_length); auto unit_stride = make_unit_stride(whole_length); const Tfloat inv_scale = 1.0 / static_cast(count_iters(whole_length) - 1); for(unsigned int b = 0; b < nbatch; b++, i_base += idist) { #pragma omp parallel for num_threads(partitions.size()) for(size_t part = 0; part < partitions.size(); ++part) { auto index = partitions[part].first; const auto length = partitions[part].second; do { const auto val_xy = -0.5 + static_cast(compute_index(index, unit_stride, 0)) * inv_scale; const auto i = compute_index(index, whole_stride, i_base); ireal[i] = val_xy; iimag[i] = val_xy; } while(increment_rowmajor(index, length)); } } } template static void generate_random_real_data(std::vector& input, const Tint1& whole_length, const Tint1& whole_stride, const size_t idist, const size_t nbatch) { auto idata = (Tfloat*)input[0].data(); size_t i_base = 0; auto partitions = partition_rowmajor(whole_length); for(unsigned int b = 0; b < nbatch; b++, i_base += idist) { #pragma omp parallel for num_threads(partitions.size()) for(size_t part = 0; part < partitions.size(); ++part) { auto index = partitions[part].first; const auto length = partitions[part].second; std::mt19937 gen(compute_index(index, whole_stride, i_base)); do { const auto i = compute_index(index, whole_stride, i_base); const Tfloat val = (Tfloat)gen() / (Tfloat)gen.max(); idata[i] = val; } while(increment_rowmajor(index, length)); } } } template static void generate_real_data(std::vector& input, const Tint1& whole_length, const Tint1& whole_stride, const size_t idist, const size_t nbatch) { auto idata = (Tfloat*)input[0].data(); size_t i_base = 0; auto partitions = partition_rowmajor(whole_length); auto unit_stride = make_unit_stride(whole_length); const Tfloat inv_scale = 1.0 / static_cast(count_iters(whole_length) - 1); for(unsigned int b = 0; b < nbatch; b++, i_base += idist) { #pragma omp parallel for 
num_threads(partitions.size()) for(size_t part = 0; part < partitions.size(); ++part) { auto index = partitions[part].first; const auto length = partitions[part].second; do { const auto i = compute_index(index, whole_stride, i_base); idata[i] = -0.5 + static_cast(compute_index(index, unit_stride, 0)) * inv_scale; } while(increment_rowmajor(index, length)); } } } template static void impose_hermitian_symmetry_interleaved(std::vector& vals, const std::vector& length, const std::vector& istride, const Tsize idist, const Tsize nbatch) { switch(length.size()) { case 1: impose_hermitian_symmetry_interleaved_1D(vals, length, istride, idist, nbatch); break; case 2: impose_hermitian_symmetry_interleaved_2D(vals, length, istride, idist, nbatch); break; case 3: impose_hermitian_symmetry_interleaved_3D(vals, length, istride, idist, nbatch); break; default: throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); } } template static void impose_hermitian_symmetry_planar(std::vector& vals, const std::vector& length, const std::vector& istride, const Tsize idist, const Tsize nbatch) { switch(length.size()) { case 1: impose_hermitian_symmetry_planar_1D(vals, length, istride, idist, nbatch); break; case 2: impose_hermitian_symmetry_planar_2D(vals, length, istride, idist, nbatch); break; case 3: impose_hermitian_symmetry_planar_3D(vals, length, istride, idist, nbatch); break; default: throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); } } #endif // DATA_GEN_HOST_H upstream/shared/rocfft_against_fftw.h0000664000175000017500000002214314637252753017034 0ustar kaolkaol// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef ROCFFT_AGAINST_FFTW #define ROCFFT_AGAINST_FFTW #include #include #include #include #include "fftw_transform.h" // Return the precision enum for rocFFT based upon the type. template inline fft_precision precision_selector(); template <> inline fft_precision precision_selector() { return fft_precision_single; } template <> inline fft_precision precision_selector() { return fft_precision_double; } extern bool use_fftw_wisdom; // construct and return an FFTW plan with the specified type, // precision, and dimensions. cpu_out is required if we're using // wisdom, which runs actual FFTs to work out the best plan. 
template static typename fftw_trait::fftw_plan_type fftw_plan_with_precision(const std::vector& dims, const std::vector& howmany_dims, const fft_transform_type transformType, const size_t isize, void* cpu_in, void* cpu_out) { using fftw_complex_type = typename fftw_trait::fftw_complex_type; // NB: Using FFTW_MEASURE implies that the input buffer's data // may be destroyed during plan creation. But if we're wanting // to run FFTW in the first place, we must have just created an // uninitialized input buffer anyway. switch(transformType) { case fft_transform_type_complex_forward: return fftw_plan_guru64_dft(dims.size(), dims.data(), howmany_dims.size(), howmany_dims.data(), reinterpret_cast(cpu_in), reinterpret_cast(cpu_out), -1, use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); case fft_transform_type_complex_inverse: return fftw_plan_guru64_dft(dims.size(), dims.data(), howmany_dims.size(), howmany_dims.data(), reinterpret_cast(cpu_in), reinterpret_cast(cpu_out), 1, use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); case fft_transform_type_real_forward: return fftw_plan_guru64_r2c(dims.size(), dims.data(), howmany_dims.size(), howmany_dims.data(), reinterpret_cast(cpu_in), reinterpret_cast(cpu_out), use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); case fft_transform_type_real_inverse: return fftw_plan_guru64_c2r(dims.size(), dims.data(), howmany_dims.size(), howmany_dims.data(), reinterpret_cast(cpu_in), reinterpret_cast(cpu_out), use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); default: throw std::runtime_error("Invalid transform type"); } } // construct an FFTW plan, given rocFFT parameters. output is // required if planning with wisdom. 
template static typename fftw_trait::fftw_plan_type fftw_plan_via_rocfft(const std::vector& length, const std::vector& istride, const std::vector& ostride, const size_t nbatch, const size_t idist, const size_t odist, const fft_transform_type transformType, std::vector& input, std::vector& output) { // Dimension configuration: std::vector dims(length.size()); for(unsigned int idx = 0; idx < length.size(); ++idx) { dims[idx].n = length[idx]; dims[idx].is = istride[idx]; dims[idx].os = ostride[idx]; } // Batch configuration: std::vector howmany_dims(1); howmany_dims[0].n = nbatch; howmany_dims[0].is = idist; howmany_dims[0].os = odist; return fftw_plan_with_precision(dims, howmany_dims, transformType, idist * nbatch, input.front().data(), output.empty() ? nullptr : output.front().data()); } template void fftw_run(fft_transform_type transformType, typename fftw_trait::fftw_plan_type cpu_plan, std::vector& cpu_in, std::vector& cpu_out) { switch(transformType) { case fft_transform_type_complex_forward: { fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); break; } case fft_transform_type_complex_inverse: { fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); break; } case fft_transform_type_real_forward: { fftw_plan_execute_r2c(cpu_plan, cpu_in, cpu_out); break; } case fft_transform_type_real_inverse: { fftw_plan_execute_c2r(cpu_plan, cpu_in, cpu_out); break; } } } // Given a transform type, return the contiguous input type. inline fft_array_type contiguous_itype(const fft_transform_type transformType) { switch(transformType) { case fft_transform_type_complex_forward: case fft_transform_type_complex_inverse: return fft_array_type_complex_interleaved; case fft_transform_type_real_forward: return fft_array_type_real; case fft_transform_type_real_inverse: return fft_array_type_hermitian_interleaved; default: throw std::runtime_error("Invalid transform type"); } return fft_array_type_complex_interleaved; } // Given a transform type, return the contiguous output type. 
inline fft_array_type contiguous_otype(const fft_transform_type transformType) { switch(transformType) { case fft_transform_type_complex_forward: case fft_transform_type_complex_inverse: return fft_array_type_complex_interleaved; case fft_transform_type_real_forward: return fft_array_type_hermitian_interleaved; case fft_transform_type_real_inverse: return fft_array_type_real; default: throw std::runtime_error("Invalid transform type"); } return fft_array_type_complex_interleaved; } // Given a precision, return the acceptable tolerance. inline double type_epsilon(const fft_precision precision) { switch(precision) { case fft_precision_half: return type_epsilon<_Float16>(); break; case fft_precision_single: return type_epsilon(); break; case fft_precision_double: return type_epsilon(); break; default: throw std::runtime_error("Invalid precision"); } } #endif upstream/shared/increment.h0000664000175000017500000000731314637252753015003 0ustar kaolkaol// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCFFT_INCREMENT_H #define ROCFFT_INCREMENT_H #include #include #include // Helper functions to iterate over a buffer in row-major order. // Indexes may be given as either a tuple or vector of sizes. They // return true if the index was successfully incremented to move to // the next element in the buffer. template static bool increment_base(T1& index, const T2& length) { static_assert(std::is_integral::value, "Integral required."); static_assert(std::is_integral::value, "Integral required."); if(index < length - 1) { ++index; return true; } index = 0; return false; } // Increment the index (row-major) for looping over 1, 2, and 3 dimensions length. template static bool increment_rowmajor(T1& index, const T2& length) { static_assert(std::is_integral::value, "Integral required."); static_assert(std::is_integral::value, "Integral required."); return increment_base(index, length); } template static bool increment_rowmajor(std::tuple& index, const std::tuple& length) { if(increment_base(std::get<1>(index), std::get<1>(length))) // we incremented ok, nothing further to do return true; // otherwise, we rolled over return increment_base(std::get<0>(index), std::get<0>(length)); } template static bool increment_rowmajor(std::tuple& index, const std::tuple& length) { if(increment_base(std::get<2>(index), std::get<2>(length))) // we incremented ok, nothing further to do return true; if(increment_base(std::get<1>(index), std::get<1>(length))) // we incremented ok, nothing further to do return true; // otherwise, we rolled over return increment_base(std::get<0>(index), std::get<0>(length)); } // Increment row-major index over arbitrary dimension length template bool increment_rowmajor(std::vector& 
index, const std::vector& length) { for(int idim = length.size(); idim-- > 0;) { if(index[idim] < length[idim]) { if((++index[idim]) == length[idim]) { index[idim] = 0; continue; } // we know we were able to increment something and didn't hit the end return true; } } // End the loop when we get back to the start: return !std::all_of(index.begin(), index.end(), [](int i) { return i == 0; }); } #endif upstream/shared/array_predicate.h0000664000175000017500000000366314637252753016161 0ustar kaolkaol// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ROCFFT_ARRAY_PREDICATE_H #define ROCFFT_ARRAY_PREDICATE_H #include "rocfft/rocfft.h" namespace { bool array_type_is_complex(rocfft_array_type type) { return type == rocfft_array_type_complex_interleaved || type == rocfft_array_type_complex_planar || type == rocfft_array_type_hermitian_interleaved || type == rocfft_array_type_hermitian_planar; } bool array_type_is_interleaved(rocfft_array_type type) { return type == rocfft_array_type_complex_interleaved || type == rocfft_array_type_hermitian_interleaved; } bool array_type_is_planar(rocfft_array_type type) { return type == rocfft_array_type_complex_planar || type == rocfft_array_type_hermitian_planar; } } #endif upstream/shared/precision_type.h0000664000175000017500000000457614637252753016063 0ustar kaolkaol// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ROCFFT_PRECISION_TYPE_H #define ROCFFT_PRECISION_TYPE_H #include "array_predicate.h" #include "rocfft/rocfft.h" static size_t real_type_size(rocfft_precision precision) { switch(precision) { case rocfft_precision_half: return 2; case rocfft_precision_single: return 4; case rocfft_precision_double: return 8; } } static size_t complex_type_size(rocfft_precision precision) { return real_type_size(precision) * 2; } static const char* precision_name(rocfft_precision precision) { switch(precision) { case rocfft_precision_half: return "half"; case rocfft_precision_single: return "single"; case rocfft_precision_double: return "double"; } } static size_t element_size(rocfft_precision precision, rocfft_array_type array_type) { return array_type_is_complex(array_type) ? complex_type_size(precision) : real_type_size(precision); } // offset a pointer by a number of elements, given the elements' // precision and type (complex or not) static void* ptr_offset(void* p, size_t elems, rocfft_precision precision, rocfft_array_type type) { return static_cast(p) + elems * element_size(precision, type); } #endif upstream/shared/hip_object_wrapper.h0000664000175000017500000000503714637252753016666 0ustar kaolkaol/****************************************************************************** * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #ifndef ROCFFT_HIP_OBJ_WRAPPER_H #define ROCFFT_HIP_OBJ_WRAPPER_H #include "rocfft_hip.h" // RAII wrapper around HIP objects template struct hip_object_wrapper_t { hip_object_wrapper_t() : obj(nullptr) { } void alloc() { if(obj == nullptr && TCreate(&obj) != hipSuccess) throw std::runtime_error("hip create failure"); } void free() { if(obj) { (void)TDestroy(obj); obj = nullptr; } } operator const T&() const { return obj; } operator T&() { return obj; } operator bool() const { return obj != nullptr; } ~hip_object_wrapper_t() { free(); } hip_object_wrapper_t(const hip_object_wrapper_t&) = delete; hip_object_wrapper_t& operator=(const hip_object_wrapper_t&) = delete; hip_object_wrapper_t(hip_object_wrapper_t&& other) : obj(other.obj) { other.obj = nullptr; } private: T obj; }; typedef hip_object_wrapper_t hipStream_wrapper_t; typedef hip_object_wrapper_t hipEvent_wrapper_t; #endif // ROCFFT_HIP_OBJ_WRAPPER_H upstream/.githooks/0000775000175000017500000000000014637252753013301 5ustar kaolkaolupstream/.githooks/install0000775000175000017500000000022214637252753014671 0ustar kaolkaol#!/usr/bin/env bash cd $(git rev-parse --git-dir) cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" 
upstream/.githooks/pre-commit0000775000175000017500000000176614637252753015315 0ustar kaolkaol#!/bin/sh # # This pre-commit hook checks if any versions of clang-format # are installed, and if so, uses the installed version to format # the staged changes. base=/opt/rocm/hcc/bin/clang-format format="" # Redirect output to stderr. exec 1>&2 # check if clang-format is installed type "$base" >/dev/null 2>&1 && format="$base" # no versions of clang-format are installed if [ -z "$format" ] then echo "$base is not installed. Pre-commit hook will not be executed." exit 0 fi # Do everything from top - level cd $(git rev-parse --show-toplevel) if git rev-parse --verify HEAD >/dev/null 2>&1 then against=HEAD else # Initial commit: diff against an empty tree object against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 fi # do the formatting for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') do if [ -e "$file" ] then echo "$format $file" "$format" -i -style=file "$file" fi done upstream/docs/0000775000175000017500000000000014637253000012306 5ustar kaolkaolupstream/docs/.sphinx/0000775000175000017500000000000014637253000013675 5ustar kaolkaolupstream/docs/.sphinx/requirements.txt0000664000175000017500000000600414637253000017161 0ustar kaolkaol# # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements.in # accessible-pygments==0.0.3 # via pydata-sphinx-theme alabaster==0.7.13 # via sphinx babel==2.12.1 # via # pydata-sphinx-theme # sphinx beautifulsoup4==4.11.2 # via pydata-sphinx-theme breathe==4.34.0 # via rocm-docs-core certifi==2022.12.7 # via requests cffi==1.15.1 # via # cryptography # pynacl charset-normalizer==3.1.0 # via requests click==8.1.3 # via sphinx-external-toc cryptography==40.0.2 # via pyjwt deprecated==1.2.13 # via pygithub docutils==0.19 # via # breathe # myst-parser # pydata-sphinx-theme # sphinx fastjsonschema==2.18.0 # via 
rocm-docs-core gitdb==4.0.10 # via gitpython gitpython==3.1.31 # via rocm-docs-core idna==3.4 # via requests imagesize==1.4.1 # via sphinx importlib-metadata==6.8.0 # via sphinx importlib-resources==6.1.0 # via rocm-docs-core jinja2==3.1.2 # via # myst-parser # sphinx markdown-it-py==2.2.0 # via # mdit-py-plugins # myst-parser markupsafe==2.1.2 # via jinja2 mdit-py-plugins==0.3.5 # via myst-parser mdurl==0.1.2 # via markdown-it-py myst-parser==1.0.0 # via rocm-docs-core packaging==23.0 # via # pydata-sphinx-theme # sphinx pycparser==2.21 # via cffi pydata-sphinx-theme==0.13.3 # via # rocm-docs-core # sphinx-book-theme pygithub==1.58.1 # via rocm-docs-core pygments==2.14.0 # via # accessible-pygments # pydata-sphinx-theme # sphinx pyjwt[crypto]==2.6.0 # via # pygithub # pyjwt pynacl==1.5.0 # via pygithub pytz==2023.3.post1 # via babel pyyaml==6.0 # via # myst-parser # rocm-docs-core # sphinx-external-toc requests==2.28.2 # via # pygithub # sphinx rocm-docs-core==0.30.0 # via -r requirements.in smmap==5.0.0 # via gitdb snowballstemmer==2.2.0 # via sphinx soupsieve==2.4 # via beautifulsoup4 sphinx==5.3.0 # via # breathe # myst-parser # pydata-sphinx-theme # rocm-docs-core # sphinx-book-theme # sphinx-copybutton # sphinx-design # sphinx-external-toc # sphinx-notfound-page sphinx-book-theme==1.0.1 # via rocm-docs-core sphinx-copybutton==0.5.1 # via rocm-docs-core sphinx-design==0.4.1 # via rocm-docs-core sphinx-external-toc==0.3.1 # via rocm-docs-core sphinx-notfound-page==0.8.3 # via rocm-docs-core sphinxcontrib-applehelp==1.0.4 # via sphinx sphinxcontrib-devhelp==1.0.2 # via sphinx sphinxcontrib-htmlhelp==2.0.1 # via sphinx sphinxcontrib-jsmath==1.0.1 # via sphinx sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx typing-extensions==4.5.0 # via pydata-sphinx-theme urllib3==1.26.15 # via requests wrapt==1.15.0 # via deprecated zipp==3.17.0 # via # importlib-metadata # importlib-resources 
upstream/docs/.sphinx/_toc.yml.in0000664000175000017500000000042214637253000015747 0ustar kaolkaol# Anywhere {branch} is used, the branch name will be substituted. # These comments will also be removed. root: index defaults: numbered: False subtrees: - caption: API entries: - file: api - file: allapi - caption: License entries: - file: license upstream/docs/.sphinx/requirements.in0000664000175000017500000000002714637253000016747 0ustar kaolkaolrocm-docs-core>=0.20.0 upstream/docs/overview.rst0000664000175000017500000000120214637253000014701 0ustar kaolkaol.. toctree:: :maxdepth: 4 :caption: Contents: ====== hipFFT ====== hipFFT is a GPU FFT marshalling library. Currently, hipFFT supports either `rocFFT`_ or `cuFFT`_ as backends. hipFFT exports an interface that does not require the client to change, regardless of the chosen backend. It sits between the application and the backend FFT library, marshalling inputs into the backend and results back to the application. The basic usage pattern is: * create a transform plan (once) * perform (many) transforms using the plan * destroy the plan .. _rocFFT: https://rocfft.readthedocs.io/ .. _cuFFT: https://developer.nvidia.com/cufft upstream/docs/index.rst0000664000175000017500000000062514637253000014152 0ustar kaolkaolWelcome to hipFFT's documentation! ================================== hipFFT is an FFT marshalling library. Currently, hipFFT supports either rocFFT_ or cuFFT_ as backends. .. toctree:: :maxdepth: 4 :caption: Contents: overview Indices and tables ================== * :ref:`genindex` * :ref:`search` .. _rocFFT: https://rocfft.readthedocs.io/ .. _cuFFT: https://developer.nvidia.com/cufft upstream/docs/conf.py0000664000175000017500000000076314637253000013613 0ustar kaolkaol# Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. 
For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html from rocm_docs import ROCmDocs external_projects_current_project = "hipfft" docs_core = ROCmDocs("hipFFT Documentation") docs_core.run_doxygen() docs_core.setup() for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) upstream/docs/.doxygen/0000775000175000017500000000000014637253000014041 5ustar kaolkaolupstream/docs/.doxygen/Doxyfile0000664000175000017500000032211214637253000015550 0ustar kaolkaol# Doxyfile 1.8.10 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. See http://www.gnu.org/software/libiconv # for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. 
PROJECT_NAME = "hipFFT" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = v1.0.14 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "prototype interfaces compatible with HIPm platform and HiP" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = docBin # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. 
ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. 
If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = ../../library/include/hipfft # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. 
If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. 
INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. 
# The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the latter case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibility issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. 
Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. 
DISTRIBUTE_GROUP_DOC = YES # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. 
This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = YES # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- SHOW_NAMESPACES = NO # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. 
EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. 
If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. 
SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. 
If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. 
GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. 
Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. 
# The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. # The default value is: NO. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. 
# Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = YES #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../../library/include/hipfft # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, # *.vhdl, *.ucf, *.qsf, *.as and *.js. 
FILE_PATTERNS = *.c \ *.cc \ *.cxx \ *.cpp \ *.c++ \ *.java \ *.ii \ *.ixx \ *.ipp \ *.i++ \ *.inl \ *.idl \ *.ddl \ *.odl \ *.h \ *.hh \ *.hxx \ *.hpp \ *.h++ \ *.cs \ *.d \ *.php \ *.php4 \ *.php5 \ *.phtml \ *.inc \ *.m \ *.markdown \ *.md \ *.mm \ *.dox \ *.py \ *.f90 \ *.f \ *.for \ *.tcl \ *.vhd \ *.vhdl \ *.ucf \ *.qsf \ *.as \ *.js # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. 
Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. 
The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = ../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. 
INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. 
# # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # compiled with the --with-libclang option. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. 
CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. 
HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. 
# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # http://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. 
# Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. 
Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries to 1 will produce a fully collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a fully expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: http://developer.apple.com/tools/xcode/), introduced with # OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. 
DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). 
If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. 
For more information please see Qt Help Project / Namespace # (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. 
QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). 
For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 1 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. 
# Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes have effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # http://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = YES # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. 
The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. # The default value is: http://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use <access key> + S # (what the <access key> is depends on the OS and browser, but it is typically # <CTRL>, <ALT>/<option>