From d0f45cfdc0605a377806d359cca9fe3fd7ce75ef Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 19 Jan 2026 13:33:13 +0100 Subject: [PATCH] GH-48897: [C++] Add benchmark for CountSetBits --- cpp/src/arrow/util/bit_util_benchmark.cc | 12 ++++++++++++ cpp/src/arrow/util/bitmap_ops.cc | 11 +++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/bit_util_benchmark.cc b/cpp/src/arrow/util/bit_util_benchmark.cc index ec90fb453fd..da624bec198 100644 --- a/cpp/src/arrow/util/bit_util_benchmark.cc +++ b/cpp/src/arrow/util/bit_util_benchmark.cc @@ -439,6 +439,17 @@ static void SetBitsTo(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * nbytes); } +static void CountSetBits(benchmark::State& state) { + int64_t nbytes = state.range(0); + std::shared_ptr buffer = CreateRandomBuffer(nbytes); + + for (auto _ : state) { + auto count = internal::CountSetBits(buffer->data(), /*bit_offset=*/0, nbytes * 8); + benchmark::DoNotOptimize(count); + } + state.SetBytesProcessed(state.iterations() * nbytes); +} + template static void CopyBitmap(benchmark::State& state) { // NOLINT non-const reference const int64_t buffer_size = state.range(0); @@ -519,6 +530,7 @@ BENCHMARK(ReverseSetBitRunReader)->Apply(SetBitRunReaderPercentageArg); BENCHMARK(VisitBits)->Arg(kBufferSize); BENCHMARK(VisitBitsUnrolled)->Arg(kBufferSize); BENCHMARK(SetBitsTo)->Arg(2)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17); +BENCHMARK(CountSetBits)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17); #ifdef ARROW_WITH_BENCHMARKS_REFERENCE static void ReferenceNaiveBitmapWriter(benchmark::State& state) { diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc index 6246656ef2a..ce2224f2f66 100644 --- a/cpp/src/arrow/util/bitmap_ops.cc +++ b/cpp/src/arrow/util/bitmap_ops.cc @@ -17,6 +17,7 @@ #include "arrow/util/bitmap_ops.h" +#include #include #include #include @@ -55,13 +56,15 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { constexpr int64_t kCountUnrollFactor = 4; const int64_t words_rounded = bit_util::RoundDown(p.aligned_words, kCountUnrollFactor); - int64_t count_unroll[kCountUnrollFactor] = {0}; + std::array count_unroll{}; // Unroll the loop for better performance for (int64_t i = 0; i < words_rounded; i += kCountUnrollFactor) { - for (int64_t k = 0; k < kCountUnrollFactor; k++) { - count_unroll[k] += bit_util::PopCount(u64_data[k]); - } + // (hand-unrolled as some gcc versions would unnest a nested `for` loop) + count_unroll[0] += bit_util::PopCount(u64_data[0]); + count_unroll[1] += bit_util::PopCount(u64_data[1]); + count_unroll[2] += bit_util::PopCount(u64_data[2]); + count_unroll[3] += bit_util::PopCount(u64_data[3]); u64_data += kCountUnrollFactor; } for (int64_t k = 0; k < kCountUnrollFactor; k++) {