paryfor-0.1/.gitignore

*~
bin/
build/
experiments/
*#*

paryfor-0.1/CMakeLists.txt

# Specify the minimum version for CMake
cmake_minimum_required(VERSION 3.1)

# Project's name
project(paryfor)

# We build using c++14
set(CMAKE_CXX_STANDARD 14)

#set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

# Use all standard-compliant optimizations
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -mcx16 -g")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -mcx16 -g")

# Set the output folder where your program will be created
set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin)
set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
set(LIBRARY_OUTPUT_PATH ${CMAKE_SOURCE_DIR}/lib)

# The following folder will be included
include_directories("${PROJECT_SOURCE_DIR}")

# set up our target executable and specify its dependencies and includes
add_executable(paryfor-test
  ${CMAKE_SOURCE_DIR}/test.cpp)
target_include_directories(paryfor-test PUBLIC "${PROJECT_SOURCE_DIR}")
target_link_libraries(paryfor-test "-latomic" Threads::Threads)

# this was hard to track down
# https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
set(CMAKE_EXE_LINKER_FLAGS "-static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive")

set(CMAKE_BUILD_TYPE Release)

paryfor-0.1/LICENSE

MIT License

Copyright (c) 2020 Erik Garrison

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
paryfor-0.1/LICENSE.atomic_queue

MIT License

Copyright (c) 2019 Maxim Egorushkin

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

paryfor-0.1/README.md

# paryfor

a parallel_for implementation based on atomic queues

## usage

The `parallel_for` templates replace `#pragma omp parallel for`. We use a callback that takes the iteration number. To avoid compiler confusion, we can specify the iteration type in the template.

```c++
#include "paryfor.hpp"

int main(int argc, char** argv) {
    uint64_t todo_count = std::stoul(argv[1]);
    int thread_count = std::stoi(argv[2]);
    int chunk_size = std::stoi(argv[3]);
    paryfor::parallel_for<uint64_t>(
        0, todo_count, thread_count, chunk_size,
        [&](uint64_t i, int tid) {
            // do work
        });
}
```

We don't need to use `chunk_size`. If omitted, we use a template that queues single iteration values rather than ranges. We can also pass a function that does not take its thread id as an argument (both simpler forms are sketched in an example at the end of this archive).

## thanks

This utility depends on Maxim Egorushkin's atomic_queue library, which is included in the single header file `paryfor.hpp`.

## author

Erik Garrison

## license

MIT

paryfor-0.1/paryfor.hpp

#pragma once

// atomic_queue
// Copyright (c) 2019 Maxim Egorushkin. MIT License
// see LICENSE.atomic_queue for full license

#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <thread>
#include <utility>
#include <vector>

#if defined(__x86_64__) || defined(_M_X64) || \
    defined(__i386__) || defined(_M_IX86)
#include <emmintrin.h>
namespace paryfor {
namespace atomic_queue {
constexpr int CACHE_LINE_SIZE = 64;
static inline void spin_loop_pause() noexcept {
    _mm_pause();
}
} // namespace atomic_queue
} // namespace paryfor
#elif defined(__arm__) || defined(__aarch64__)
// TODO: These need to be verified as I do not have access to ARM platform.
namespace paryfor { namespace atomic_queue { constexpr int CACHE_LINE_SIZE = 64; static inline void spin_loop_pause() noexcept { #if (defined(__ARM_ARCH_6K__) || \ defined(__ARM_ARCH_6Z__) || \ defined(__ARM_ARCH_6ZK__) || \ defined(__ARM_ARCH_6T2__) || \ defined(__ARM_ARCH_7__) || \ defined(__ARM_ARCH_7A__) || \ defined(__ARM_ARCH_7R__) || \ defined(__ARM_ARCH_7M__) || \ defined(__ARM_ARCH_7S__) || \ defined(__ARM_ARCH_8A__) || \ defined(__aarch64__)) asm volatile ("yield" ::: "memory"); #else asm volatile ("nop" ::: "memory"); #endif } } // namespace atomic_queue } // namespace paryfor #else #error "Unknown CPU architecture." #endif //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// namespace paryfor { namespace atomic_queue { #if defined(__GNUC__) || defined(__clang__) #define ATOMIC_QUEUE_LIKELY(expr) __builtin_expect(static_cast(expr), 1) #define ATOMIC_QUEUE_UNLIKELY(expr) __builtin_expect(static_cast(expr), 0) #else #define ATOMIC_QUEUE_LIKELY(expr) expr #define ATOMIC_QUEUE_UNLIKELY(expr) expr #endif //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// auto constexpr A = std::memory_order_acquire; auto constexpr R = std::memory_order_release; auto constexpr X = std::memory_order_relaxed; auto constexpr C = std::memory_order_seq_cst; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace atomic_queue } // namespace paryfor //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// namespace paryfor { namespace atomic_queue { using std::uint32_t; using std::uint64_t; using std::uint8_t; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// namespace details { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template struct GetCacheLineIndexBits { static int constexpr value = 0; }; template<> struct GetCacheLineIndexBits<64> { static int constexpr value = 6; }; template<> struct GetCacheLineIndexBits<32> { static int constexpr value = 5; }; template<> struct GetCacheLineIndexBits<16> { static int constexpr value = 4; }; template<> struct GetCacheLineIndexBits< 8> { static int constexpr value = 3; }; template<> struct GetCacheLineIndexBits< 4> { static int constexpr value = 2; }; template<> struct GetCacheLineIndexBits< 2> { static int constexpr value = 1; }; template struct GetIndexShuffleBits { static int constexpr bits = GetCacheLineIndexBits::value; static unsigned constexpr min_size = 1u << (bits * 2); static int constexpr value = array_size < min_size ? 0 : bits; }; template struct GetIndexShuffleBits { static int constexpr value = 0; }; // Multiple writers/readers contend on the same cache line when storing/loading elements at // subsequent indexes, aka false sharing. For power of 2 ring buffer size it is possible to re-map // the index in such a way that each subsequent element resides on another cache line, which // minimizes contention. 
This is done by swapping the lowest order N bits (which are the index of // the element within the cache line) with the next N bits (which are the index of the cache line) // of the element index. template constexpr unsigned remap_index(unsigned index) noexcept { constexpr unsigned MASK = (1u << BITS) - 1; unsigned mix = (index ^ (index >> BITS)) & MASK; return index ^ mix ^ (mix << BITS); } template<> constexpr unsigned remap_index<0>(unsigned index) noexcept { return index; } template constexpr T& map(T* elements, unsigned index) noexcept { index = remap_index(index); return elements[index]; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Implement a "bit-twiddling hack" for finding the next power of 2 // in either 32 bits or 64 bits in C++11 compatible constexpr functions // "Runtime" version for 32 bits // --a; // a |= a >> 1; // a |= a >> 2; // a |= a >> 4; // a |= a >> 8; // a |= a >> 16; // ++a; template constexpr T decrement(T x) { return x - 1; } template constexpr T increment(T x) { return x + 1; } template constexpr T or_equal(T x, unsigned u) { return (x | x >> u); } template constexpr T or_equal(T x, unsigned u, Args... rest) { return or_equal(or_equal(x, u), rest...); } constexpr uint32_t round_up_to_power_of_2(uint32_t a) noexcept { return increment(or_equal(decrement(a), 1, 2, 4, 8, 16)); } constexpr uint64_t round_up_to_power_of_2(uint64_t a) noexcept { return increment(or_equal(decrement(a), 1, 2, 4, 8, 16, 32)); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace details //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class AtomicQueueCommon { protected: // Put these on different cache lines to avoid false sharing between readers and writers. alignas(CACHE_LINE_SIZE) std::atomic head_ = {}; alignas(CACHE_LINE_SIZE) std::atomic tail_ = {}; // The special member functions are not thread-safe. AtomicQueueCommon() noexcept = default; AtomicQueueCommon(AtomicQueueCommon const& b) noexcept : head_(b.head_.load(X)) , tail_(b.tail_.load(X)) {} AtomicQueueCommon& operator=(AtomicQueueCommon const& b) noexcept { head_.store(b.head_.load(X), X); tail_.store(b.tail_.load(X), X); return *this; } void swap(AtomicQueueCommon& b) noexcept { unsigned h = head_.load(X); unsigned t = tail_.load(X); head_.store(b.head_.load(X), X); tail_.store(b.tail_.load(X), X); b.head_.store(h, X); b.tail_.store(t, X); } template static T do_pop_atomic(std::atomic& q_element) noexcept { if(Derived::spsc_) { for(;;) { T element = q_element.load(X); if(ATOMIC_QUEUE_LIKELY(element != NIL)) { q_element.store(NIL, R); return element; } if(Derived::maximize_throughput_) spin_loop_pause(); } } else { for(;;) { T element = q_element.exchange(NIL, R); // (2) The store to wait for. if(ATOMIC_QUEUE_LIKELY(element != NIL)) return element; // Do speculative loads while busy-waiting to avoid broadcasting RFO messages. 
do spin_loop_pause(); while(Derived::maximize_throughput_ && q_element.load(X) == NIL); } } } template static void do_push_atomic(T element, std::atomic& q_element) noexcept { assert(element != NIL); if(Derived::spsc_) { while(ATOMIC_QUEUE_UNLIKELY(q_element.load(X) != NIL)) if(Derived::maximize_throughput_) spin_loop_pause(); q_element.store(element, R); } else { for(T expected = NIL; ATOMIC_QUEUE_UNLIKELY(!q_element.compare_exchange_strong(expected, element, R, X)); expected = NIL) { do spin_loop_pause(); // (1) Wait for store (2) to complete. while(Derived::maximize_throughput_ && q_element.load(X) != NIL); } } } enum State : unsigned char { EMPTY, STORING, STORED, LOADING }; template static T do_pop_any(std::atomic& state, T& q_element) noexcept { if(Derived::spsc_) { while(ATOMIC_QUEUE_UNLIKELY(state.load(A) != STORED)) if(Derived::maximize_throughput_) spin_loop_pause(); T element{std::move(q_element)}; state.store(EMPTY, R); return element; } else { for(;;) { unsigned char expected = STORED; if(ATOMIC_QUEUE_LIKELY(state.compare_exchange_strong(expected, LOADING, X, X))) { T element{std::move(q_element)}; state.store(EMPTY, R); return element; } // Do speculative loads while busy-waiting to avoid broadcasting RFO messages. do spin_loop_pause(); while(Derived::maximize_throughput_ && state.load(X) != STORED); } } } template static void do_push_any(U&& element, std::atomic& state, T& q_element) noexcept { if(Derived::spsc_) { while(ATOMIC_QUEUE_UNLIKELY(state.load(A) != EMPTY)) if(Derived::maximize_throughput_) spin_loop_pause(); q_element = std::forward(element); state.store(STORED, R); } else { for(;;) { unsigned char expected = EMPTY; if(ATOMIC_QUEUE_LIKELY(state.compare_exchange_strong(expected, STORING, X, X))) { q_element = std::forward(element); state.store(STORED, R); return; } // Do speculative loads while busy-waiting to avoid broadcasting RFO messages. do spin_loop_pause(); while(Derived::maximize_throughput_ && state.load(X) != EMPTY); } } } public: template bool try_push(T&& element) noexcept { auto head = head_.load(X); if(Derived::spsc_) { if(static_cast(head - tail_.load(X)) >= static_cast(static_cast(*this).size_)) return false; head_.store(head + 1, X); } else { do { if(static_cast(head - tail_.load(X)) >= static_cast(static_cast(*this).size_)) return false; } while(ATOMIC_QUEUE_UNLIKELY(!head_.compare_exchange_strong(head, head + 1, A, X))); // This loop is not FIFO. } static_cast(*this).do_push(std::forward(element), head); return true; } template bool try_pop(T& element) noexcept { auto tail = tail_.load(X); if(Derived::spsc_) { if(static_cast(head_.load(X) - tail) <= 0) return false; tail_.store(tail + 1, X); } else { do { if(static_cast(head_.load(X) - tail) <= 0) return false; } while(ATOMIC_QUEUE_UNLIKELY(!tail_.compare_exchange_strong(tail, tail + 1, A, X))); // This loop is not FIFO. } element = static_cast(*this).do_pop(tail); return true; } template void push(T&& element) noexcept { unsigned head; if(Derived::spsc_) { head = head_.load(X); head_.store(head + 1, X); } else { constexpr auto memory_order = Derived::total_order_ ? std::memory_order_seq_cst : std::memory_order_acquire; head = head_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019. } static_cast(*this).do_push(std::forward(element), head); } auto pop() noexcept { unsigned tail; if(Derived::spsc_) { tail = tail_.load(X); tail_.store(tail + 1, X); } else { constexpr auto memory_order = Derived::total_order_ ? 
std::memory_order_seq_cst : std::memory_order_acquire; tail = tail_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019. } return static_cast(*this).do_pop(tail); } bool was_empty() const noexcept { return static_cast(head_.load(X) - tail_.load(X)) <= 0; } bool was_full() const noexcept { return static_cast(head_.load(X) - tail_.load(X)) >= static_cast(static_cast(*this).size_); } unsigned capacity() const noexcept { return static_cast(*this).size_; } }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class AtomicQueue : public AtomicQueueCommon> { using Base = AtomicQueueCommon>; friend Base; static constexpr unsigned size_ = MINIMIZE_CONTENTION ? details::round_up_to_power_of_2(SIZE) : SIZE; static constexpr int SHUFFLE_BITS = details::GetIndexShuffleBits)>::value; static constexpr bool total_order_ = TOTAL_ORDER; static constexpr bool spsc_ = SPSC; static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT; alignas(CACHE_LINE_SIZE) std::atomic elements_[size_] = {}; // Empty elements are NIL. T do_pop(unsigned tail) noexcept { std::atomic& q_element = details::map(elements_, tail % size_); return Base::template do_pop_atomic(q_element); } void do_push(T element, unsigned head) noexcept { std::atomic& q_element = details::map(elements_, head % size_); Base::template do_push_atomic(element, q_element); } public: using value_type = T; AtomicQueue() noexcept { assert(std::atomic{NIL}.is_lock_free()); // This queue is for atomic elements only. AtomicQueue2 is for non-atomic ones. if(T{} != NIL) for(auto& element : elements_) element.store(NIL, X); } AtomicQueue(AtomicQueue const&) = delete; AtomicQueue& operator=(AtomicQueue const&) = delete; }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class AtomicQueue2 : public AtomicQueueCommon> { using Base = AtomicQueueCommon>; using State = typename Base::State; friend Base; static constexpr unsigned size_ = MINIMIZE_CONTENTION ? 
details::round_up_to_power_of_2(SIZE) : SIZE; static constexpr int SHUFFLE_BITS = details::GetIndexShuffleBits::value; static constexpr bool total_order_ = TOTAL_ORDER; static constexpr bool spsc_ = SPSC; static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT; alignas(CACHE_LINE_SIZE) std::atomic states_[size_] = {}; alignas(CACHE_LINE_SIZE) T elements_[size_] = {}; T do_pop(unsigned tail) noexcept { unsigned index = details::remap_index(tail % size_); return Base::template do_pop_any(states_[index], elements_[index]); } template void do_push(U&& element, unsigned head) noexcept { unsigned index = details::remap_index(head % size_); Base::template do_push_any(std::forward(element), states_[index], elements_[index]); } public: using value_type = T; AtomicQueue2() noexcept = default; AtomicQueue2(AtomicQueue2 const&) = delete; AtomicQueue2& operator=(AtomicQueue2 const&) = delete; }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template, T NIL = T{}, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false> class AtomicQueueB : public AtomicQueueCommon>, private std::allocator_traits::template rebind_alloc> { using Base = AtomicQueueCommon>; friend Base; static constexpr bool total_order_ = TOTAL_ORDER; static constexpr bool spsc_ = SPSC; static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT; using AllocatorElements = typename std::allocator_traits::template rebind_alloc>; static constexpr auto ELEMENTS_PER_CACHE_LINE = CACHE_LINE_SIZE / sizeof(std::atomic); static_assert(ELEMENTS_PER_CACHE_LINE, "Unexpected ELEMENTS_PER_CACHE_LINE."); static constexpr auto SHUFFLE_BITS = details::GetCacheLineIndexBits::value; static_assert(SHUFFLE_BITS, "Unexpected SHUFFLE_BITS."); // AtomicQueueCommon members are stored into by readers and writers. // Allocate these immutable members on another cache line which never gets invalidated by stores. alignas(CACHE_LINE_SIZE) unsigned size_; std::atomic* elements_; T do_pop(unsigned tail) noexcept { std::atomic& q_element = details::map(elements_, tail & (size_ - 1)); return Base::template do_pop_atomic(q_element); } void do_push(T element, unsigned head) noexcept { std::atomic& q_element = details::map(elements_, head & (size_ - 1)); Base::template do_push_atomic(element, q_element); } public: using value_type = T; // The special member functions are not thread-safe. AtomicQueueB(unsigned size) : size_(std::max(details::round_up_to_power_of_2(size), 1u << (SHUFFLE_BITS * 2))) , elements_(AllocatorElements::allocate(size_)) { assert(std::atomic{NIL}.is_lock_free()); // This queue is for atomic elements only. AtomicQueueB2 is for non-atomic ones. for(auto p = elements_, q = elements_ + size_; p < q; ++p) p->store(NIL, X); } AtomicQueueB(AtomicQueueB&& b) noexcept : Base(static_cast(b)) , AllocatorElements(static_cast(b)) // TODO: This must be noexcept, static_assert that. , size_(b.size_) , elements_(b.elements_) { b.size_ = 0; b.elements_ = 0; } AtomicQueueB& operator=(AtomicQueueB&& b) noexcept { b.swap(*this); return *this; } ~AtomicQueueB() noexcept { if(elements_) AllocatorElements::deallocate(elements_, size_); // TODO: This must be noexcept, static_assert that. 
} void swap(AtomicQueueB& b) noexcept { using std::swap; this->Base::swap(b); swap(static_cast(*this), static_cast(b)); swap(size_, b.size_); swap(elements_, b.elements_); } friend void swap(AtomicQueueB& a, AtomicQueueB& b) { a.swap(b); } }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false> class AtomicQueueB2 : public AtomicQueueCommon>, private A, private std::allocator_traits::template rebind_alloc> { using Base = AtomicQueueCommon>; using State = typename Base::State; friend Base; static constexpr bool total_order_ = TOTAL_ORDER; static constexpr bool spsc_ = SPSC; static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT; using AllocatorElements = A; using AllocatorStates = typename std::allocator_traits::template rebind_alloc>; // AtomicQueueCommon members are stored into by readers and writers. // Allocate these immutable members on another cache line which never gets invalidated by stores. alignas(CACHE_LINE_SIZE) unsigned size_; std::atomic* states_; T* elements_; static constexpr auto STATES_PER_CACHE_LINE = CACHE_LINE_SIZE / sizeof(State); static_assert(STATES_PER_CACHE_LINE, "Unexpected STATES_PER_CACHE_LINE."); static constexpr auto SHUFFLE_BITS = details::GetCacheLineIndexBits::value; static_assert(SHUFFLE_BITS, "Unexpected SHUFFLE_BITS."); T do_pop(unsigned tail) noexcept { unsigned index = details::remap_index(tail & (size_ - 1)); return Base::template do_pop_any(states_[index], elements_[index]); } template void do_push(U&& element, unsigned head) noexcept { unsigned index = details::remap_index(head & (size_ - 1)); Base::template do_push_any(std::forward(element), states_[index], elements_[index]); } public: using value_type = T; // The special member functions are not thread-safe. AtomicQueueB2(unsigned size) : size_(std::max(details::round_up_to_power_of_2(size), 1u << (SHUFFLE_BITS * 2))) , states_(AllocatorStates::allocate(size_)) , elements_(AllocatorElements::allocate(size_)) { for(auto p = states_, q = states_ + size_; p < q; ++p) p->store(Base::EMPTY, X); AllocatorElements& ae = *this; for(auto p = elements_, q = elements_ + size_; p < q; ++p) std::allocator_traits::construct(ae, p); } AtomicQueueB2(AtomicQueueB2&& b) noexcept : Base(static_cast(b)) , AllocatorElements(static_cast(b)) // TODO: This must be noexcept, static_assert that. , AllocatorStates(static_cast(b)) // TODO: This must be noexcept, static_assert that. , size_(b.size_) , states_(b.states_) , elements_(b.elements_) { b.size_ = 0; b.states_ = 0; b.elements_ = 0; } AtomicQueueB2& operator=(AtomicQueueB2&& b) noexcept { b.swap(*this); return *this; } ~AtomicQueueB2() noexcept { if(elements_) { AllocatorElements& ae = *this; for(auto p = elements_, q = elements_ + size_; p < q; ++p) std::allocator_traits::destroy(ae, p); AllocatorElements::deallocate(elements_, size_); // TODO: This must be noexcept, static_assert that. AllocatorStates::deallocate(states_, size_); // TODO: This must be noexcept, static_assert that. 
        }
    }

    void swap(AtomicQueueB2& b) noexcept {
        using std::swap;
        this->Base::swap(b);
        swap(static_cast<AllocatorElements&>(*this), static_cast<AllocatorElements&>(b));
        swap(static_cast<AllocatorStates&>(*this), static_cast<AllocatorStates&>(b));
        swap(size_, b.size_);
        swap(states_, b.states_);
        swap(elements_, b.elements_);
    }

    friend void swap(AtomicQueueB2& a, AtomicQueueB2& b) noexcept {
        a.swap(b);
    }
};

////////////////////////////////////////////////////////////////////////////////

template<class Queue>
struct RetryDecorator : Queue {
    using T = typename Queue::value_type;

    using Queue::Queue;

    void push(T element) noexcept {
        while(!this->try_push(element))
            spin_loop_pause();
    }

    T pop() noexcept {
        T element;
        while(!this->try_pop(element))
            spin_loop_pause();
        return element;
    }
};

////////////////////////////////////////////////////////////////////////////////

} // namespace atomic_queue
} // namespace paryfor

////////////////////////////////////////////////////////////////////////////////

// paryfor parallel for implementation
// Copyright (c) 2020 Erik Garrison. MIT License
// see LICENSE for details

namespace paryfor {

using std::uint32_t;
using std::uint64_t;
using std::uint8_t;

template <typename I>
void parallel_for(const I& begin, const I& end, const uint64_t& nthreads,
                  const std::function<void(I, int)>& func) {
    auto queue_ptr = new atomic_queue::AtomicQueue2<I, 2 << 16>;
    auto& queue = *queue_ptr;
    std::atomic<bool> work_todo;
    auto worker = [&queue,&work_todo,&func](int thread_id) {
        I i;
        while (work_todo.load()) {
            if (queue.try_pop(i)) {
                func(i, thread_id);
            } else {
                std::this_thread::sleep_for(std::chrono::nanoseconds(1));
            }
        }
    };
    std::vector<std::thread> workers;
    workers.reserve(nthreads);
    work_todo.store(true);
    for (uint64_t t = 0; t < nthreads; ++t) {
        workers.emplace_back(worker, t);
    }
    I todo_i = begin;
    while (todo_i != end) {
        if (queue.try_push(todo_i)) {
            ++todo_i;
        } else {
            std::this_thread::sleep_for(std::chrono::nanoseconds(1));
        }
    }
    while (!queue.was_empty()) {
        std::this_thread::sleep_for(std::chrono::nanoseconds(1));
    }
    work_todo.store(false);
    for (uint64_t t = 0; t < nthreads; ++t) {
        workers[t].join();
    }
    delete queue_ptr;
}

// specialization where we don't use the thread id
template <typename I>
void parallel_for(const I& begin, const I& end, const uint64_t& nthreads,
                  const std::function<void(I)>& func) {
    parallel_for<I>(begin, end, nthreads,
                    [&func](I i, int id) { func(i); });
}

template <typename I>
void parallel_for(const I& begin, const I& end, const uint64_t& nthreads, const uint64_t& chunk_size,
                  const std::function<void(I, int)>& func) {
    auto queue_ptr = new atomic_queue::AtomicQueue2<std::pair<I, I>, 2 << 16>;
    auto& queue = *queue_ptr;
    std::atomic<bool> work_todo;
    auto worker = [&queue,&work_todo,&func](int thread_id) {
        std::pair<I, I> p;
        while (work_todo.load()) {
            if (queue.try_pop(p)) {
                for (I i = p.first; i < p.second; ++i) {
                    func(i, thread_id);
                }
            } else {
                std::this_thread::sleep_for(std::chrono::nanoseconds(1));
            }
        }
    };
    std::vector<std::thread> workers;
    workers.reserve(nthreads);
    work_todo.store(true);
    for (uint64_t t = 0; t < nthreads; ++t) {
        workers.emplace_back(worker, t);
    }
    std::pair<I, I> todo_range = std::make_pair(begin, std::min(begin + chunk_size, end));
    I& todo_i = todo_range.first;
    I& todo_j = todo_range.second;
    while (todo_i != end) {
        if (queue.try_push(todo_range)) {
            todo_i = std::min(todo_i + chunk_size, end);
            todo_j = std::min(todo_j + chunk_size, end);
        } else {
            std::this_thread::sleep_for(std::chrono::nanoseconds(1));
        }
    }
    while (!queue.was_empty()) {
        std::this_thread::sleep_for(std::chrono::nanoseconds(1));
    }
    work_todo.store(false);
    for (uint64_t t = 0; t < nthreads; ++t) {
        workers[t].join();
    }
    delete queue_ptr;
}

// specialization where we don't use the thread id
template <typename I>
void parallel_for(const I& begin, const I& end, const uint64_t& nthreads, const uint64_t& chunk_size,
                  const std::function<void(I)>& func) {
    parallel_for<I>(begin, end, nthreads, chunk_size,
                    [&func](I i, int id) { func(i); });
}

}

paryfor-0.1/test.cpp

#include <iostream>
#include <vector>
#include <mutex>
#include <cmath>
#include "paryfor.hpp"

int main(int argc, char** argv) {
    // just test that we can compile
    uint64_t todo_count = std::stoul(argv[1]);
    int thread_count = std::stoi(argv[2]);
    int chunk_size = std::stoi(argv[3]);
    std::vector<uint64_t> count(thread_count);
    std::mutex count_mutex;
    paryfor::parallel_for<uint64_t>(
        0, todo_count, thread_count, chunk_size,
        [&](uint64_t i, int tid) {
            // do some trivial work, so that we can see the effect of multithreading
            for (uint64_t j = 0; j < 100; ++j) {
                i *= exp(i) * exp(i+i) / log(i+i+i);
            }
            ++count[tid];
        });
    uint64_t c = 0;
    for (int i = 0; i < thread_count; ++i) {
        std::cout << "thread " << i << " " << count[i] << std::endl;
        c += count[i];
    }
    if (c != todo_count) {
        std::cerr << "error: count does not match that requested" << std::endl;
        return 1;
    }
    return 0;
}
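As a complement to test.cpp, here is a minimal sketch of the two simpler call forms described in the README: omitting `chunk_size`, so that single iteration indices are queued instead of ranges, and passing a callback that does not take the thread id. The iteration count, thread count, and the `sum` accumulator are illustrative values, not part of the repository.

```c++
#include <atomic>
#include <cstdint>
#include <iostream>
#include "paryfor.hpp"

int main() {
    const uint64_t n = 1000000;   // hypothetical iteration count
    const uint64_t threads = 4;   // hypothetical thread count
    std::atomic<uint64_t> sum(0);

    // No chunk_size: each iteration index is queued individually,
    // and the callback still receives the worker's thread id.
    paryfor::parallel_for<uint64_t>(0, n, threads, [&](uint64_t i, int tid) {
        sum += i;
    });

    // No chunk_size and no thread id: the simplest form.
    paryfor::parallel_for<uint64_t>(0, n, threads, [&](uint64_t i) {
        sum += i;
    });

    std::cout << sum.load() << std::endl;
    return 0;
}
```

Both no-id forms simply wrap the callback and dispatch to the thread-id-aware templates, so they differ in convenience rather than in scheduling behavior.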