From 3c4661e699f10f68e28215826d8d892b7acfce68 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 30 Apr 2025 11:15:49 +0200 Subject: [PATCH] add OpenMP segmented prefix sum --- omp/components/prefix_sum.hpp | 113 +++++++++++++++++++++++++++++ omp/test/components/CMakeLists.txt | 1 + omp/test/components/prefix_sum.cpp | 74 +++++++++++++++++++ 3 files changed, 188 insertions(+) create mode 100644 omp/components/prefix_sum.hpp create mode 100644 omp/test/components/CMakeLists.txt create mode 100644 omp/test/components/prefix_sum.cpp diff --git a/omp/components/prefix_sum.hpp b/omp/components/prefix_sum.hpp new file mode 100644 index 00000000000..4c2ff0bd5a8 --- /dev/null +++ b/omp/components/prefix_sum.hpp @@ -0,0 +1,113 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_OMP_COMPONENTS_PREFIX_SUM_HPP_ +#define GKO_OMP_COMPONENTS_PREFIX_SUM_HPP_ + +#include +#include +#include +#include + +#include + +#include "core/base/allocator.hpp" +#include "core/base/iterator_factory.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace components { + + +/* + * Similar to prefix_sum, only reduces within runs of the same key value (each + * key run must only occur once, otherwise the scan operation is not necessarily + * associaive). It also doesn't ignore the last value! + * Similar to thrust::exclusive_scan_by_key + */ +template ::value_type>> +void segmented_prefix_sum( + std::shared_ptr exec, KeyIterator key, Iterator it, + const size_type num_entries, + typename std::iterator_traits::value_type key_init = {}, + typename std::iterator_traits::value_type init = {}, + ScanOp op = {}) +{ + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; + // the operation only makes sense for arrays of size at least 2 + if (num_entries < 2) { + if (num_entries == 0) { + return; + } else { + *it = init; + return; + } + } + + const int nthreads = omp_get_max_threads(); + vector proc_sums(nthreads, init, {exec}); + vector proc_first_key(nthreads, key_init, {exec}); + vector proc_last_key(nthreads, key_init, {exec}); + const size_type def_num_witems = (num_entries - 1) / nthreads + 1; + +#pragma omp parallel + { + const int thread_id = omp_get_thread_num(); + const size_type startidx = thread_id * def_num_witems; + const size_type endidx = + std::min(num_entries, (thread_id + 1) * def_num_witems); + + auto partial_sum = init; + auto cur_key = startidx < num_entries ? key[startidx] : key_init; + proc_first_key[thread_id] = cur_key; + for (size_type i = startidx; i < endidx; ++i) { + auto value = it[i]; + auto new_key = key[i]; + if (cur_key != new_key) { + partial_sum = init; + cur_key = new_key; + } + it[i] = partial_sum; + partial_sum = op(partial_sum, value); + } + + proc_sums[thread_id] = partial_sum; + proc_last_key[thread_id] = cur_key; + +#pragma omp barrier + +#pragma omp single + { + for (int i = 0; i < nthreads - 1; i++) { + // the next block carries over the previous partial sum + // if it starts and ends with the same key as the next one + if (proc_last_key[i] == proc_first_key[i + 1] && + proc_first_key[i + 1] == proc_last_key[i + 1]) { + proc_sums[i + 1] = op(proc_sums[i], proc_sums[i + 1]); + } + } + } + + if (thread_id > 0) { + for (size_type i = startidx; i < endidx; i++) { + if (key[i] == proc_last_key[thread_id - 1]) { + it[i] = op(it[i], proc_sums[thread_id - 1]); + } + } + } + } +} + + +} // namespace components +} // namespace omp +} // namespace kernels +} // namespace gko + +#endif // GKO_OMP_COMPONENTS_PREFIX_SUM_HPP_ diff --git a/omp/test/components/CMakeLists.txt b/omp/test/components/CMakeLists.txt new file mode 100644 index 00000000000..93aee23a1d2 --- /dev/null +++ b/omp/test/components/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_omp_test(prefix_sum) diff --git a/omp/test/components/prefix_sum.cpp b/omp/test/components/prefix_sum.cpp new file mode 100644 index 00000000000..4069cc53c31 --- /dev/null +++ b/omp/test/components/prefix_sum.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "omp/components/prefix_sum.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "core/base/index_range.hpp" +#include "core/test/utils.hpp" + + +template +class PrefixSum : public ::testing::Test { +protected: + using index_type = T; + + PrefixSum() : exec{gko::OmpExecutor::create()}, rand(293) {} + + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; +}; + +TYPED_TEST_SUITE(PrefixSum, gko::test::IndexTypes, TypenameNameGenerator); + + +TYPED_TEST(PrefixSum, SegmentedPrefixSumWorks) +{ + using index_type = typename TestFixture::index_type; + const auto max_threads = omp_get_max_threads(); + for (int num_threads = 1; num_threads <= max_threads; num_threads++) { + SCOPED_TRACE(num_threads); + omp_set_num_threads(num_threads); + for (int num_ranges : {10, 100, 1000}) { + SCOPED_TRACE(num_ranges); + // repeate multiple times for different random seeds + for (int repetition : gko::irange{10}) { + std::uniform_int_distribution count_dist{0, 100}; + std::uniform_int_distribution value_dist{-200, 200}; + std::vector ref_result; + std::vector keys; + std::vector input; + for (int i = 0; i < num_ranges; i++) { + const auto start = keys.size(); + const auto new_count = count_dist(this->rand); + keys.insert(keys.end(), new_count, i); + std::generate_n(std::back_inserter(input), new_count, + [&] { return value_dist(this->rand); }); + std::copy(input.begin() + start, input.end(), + std::back_inserter(ref_result)); + std::exclusive_scan( + ref_result.begin() + start, ref_result.end(), + ref_result.begin() + start, index_type{}); + } + + gko::kernels::omp::components::segmented_prefix_sum( + this->exec, keys.cbegin(), input.begin(), keys.size()); + + ASSERT_EQ(input, ref_result); + } + } + } +}