Changes from all commits (67 commits)
1e8517c
Add SSE4.2 implementation
AntoinePrv Oct 27, 2025
2b5d653
Add unpack uint8_t benchmark
AntoinePrv Oct 28, 2025
6727642
Add bool unpack benchmark
AntoinePrv Oct 30, 2025
1115bb0
Bias benchmarks toward small scale
AntoinePrv Nov 25, 2025
356eb91
Add Kernel plan builder
AntoinePrv Oct 20, 2025
ebbe47c
Add simd kernel
AntoinePrv Oct 24, 2025
0b89a10
Handle rshifts on SSE2
AntoinePrv Oct 27, 2025
701bbc7
Use new kernel when possible in generated 128 code
AntoinePrv Oct 27, 2025
f3a631b
Refactor array to xsimd::batch_constant
AntoinePrv Oct 27, 2025
6fba158
Refactor right shift
AntoinePrv Oct 27, 2025
4f0594a
Add oversized plan
AntoinePrv Oct 28, 2025
718e771
Add oversized kernel
AntoinePrv Oct 28, 2025
88dd517
Rename kernels
AntoinePrv Oct 28, 2025
99269c8
Add simd kernel dispatch
AntoinePrv Oct 28, 2025
cd7728b
Call Simd kernel directly
AntoinePrv Oct 28, 2025
69f9da7
Fix SIMD level None
AntoinePrv Oct 29, 2025
778cc8d
Initialize swizzles to -1
AntoinePrv Oct 29, 2025
1a9bceb
Doc
AntoinePrv Oct 29, 2025
7ce108c
Improve test error message
AntoinePrv Oct 29, 2025
c579b32
Use new kernel in avx2
AntoinePrv Oct 28, 2025
a37e8bd
AVX2 swizzle fallback
AntoinePrv Oct 29, 2025
5cbfb88
Remove dead code
AntoinePrv Oct 30, 2025
1e4e806
Simplify Large masks
AntoinePrv Oct 30, 2025
ff2f1f4
Remove bpacking 256 generated file
AntoinePrv Oct 30, 2025
65eddc5
Remove uint8_t fallback
AntoinePrv Oct 30, 2025
baa097b
Add boolean simd implementation
AntoinePrv Oct 30, 2025
0d58ac8
Use std::is_base_of for arch detection
AntoinePrv Oct 30, 2025
54714b2
Improve swizzle
AntoinePrv Nov 17, 2025
d441e91
Only use lshift hack when available
AntoinePrv Nov 17, 2025
69a7f60
Fix return type
AntoinePrv Nov 17, 2025
56a0438
Fix shift included size
AntoinePrv Nov 18, 2025
187eb75
Add Avx2 uint16_t shift fallback
AntoinePrv Nov 19, 2025
303d95c
Refactor make_mult
AntoinePrv Nov 19, 2025
1f70a63
Add Avx2 lshift unint8_t fallback
AntoinePrv Nov 19, 2025
825d769
Refactor right shift excess
AntoinePrv Nov 19, 2025
cd82a5a
Refactor make_mult
AntoinePrv Nov 20, 2025
d8ed902
Add SSE var shift uint8_t fallback to uint16_t
AntoinePrv Nov 20, 2025
d80c117
Implement size reading reduction
AntoinePrv Nov 20, 2025
14cb815
Add fallback Avx2 right shift
AntoinePrv Nov 24, 2025
664df7e
Refactor static dispatch
AntoinePrv Nov 26, 2025
3be30a5
Forward oversized to larger uint when possible
AntoinePrv Nov 26, 2025
bfbb477
Add arch detection functions
AntoinePrv Nov 26, 2025
1875343
Refactor traits usage
AntoinePrv Nov 26, 2025
369aa10
Forward x86_64 unpack64 to unpack32
AntoinePrv Nov 26, 2025
e6744f5
Simplify template usage
AntoinePrv Nov 26, 2025
7e916f6
Reorganize and doc
AntoinePrv Nov 26, 2025
89762f2
Refactor KernelDispatch and remove Oversized dispatch
AntoinePrv Nov 26, 2025
43eb7c5
Forward large unpack8 to unpack16 on SSE2
AntoinePrv Nov 26, 2025
2a7c0ef
Use fallback right shift on large uint8_t avx2
AntoinePrv Nov 26, 2025
352962c
Fix enable_if
AntoinePrv Nov 27, 2025
0d4e759
Add missing header
AntoinePrv Nov 27, 2025
acf6d42
fmt
AntoinePrv Nov 27, 2025
b7e46cd
Add SSE4.2 to dynamic dispatch
AntoinePrv Nov 27, 2025
fc8d85a
Rename bpacking_simd_impl > bpacking_simd_kernel
AntoinePrv Nov 27, 2025
a65088d
Restore modifications to simd_codegen
AntoinePrv Nov 27, 2025
c55e60c
Reduce reading size and declare bytes read
AntoinePrv Nov 27, 2025
76f9a22
Add kBytesRead to scalar code
AntoinePrv Nov 27, 2025
d36ccad
Add kBytesRead to simd 512 generated code
AntoinePrv Nov 27, 2025
bf2fa0d
Prevent overreading
AntoinePrv Nov 27, 2025
d31a7d7
Fix pessimit overeading guard
AntoinePrv Nov 28, 2025
58189bd
Fix overreading guard comparison
AntoinePrv Dec 1, 2025
e618067
Add UnpackOptions and max_read_bytes
AntoinePrv Dec 1, 2025
0dfb83e
Use C++20 NTTP
AntoinePrv Jan 7, 2026
5aa8b36
xsimd 14.0 compatibility
AntoinePrv Jan 8, 2026
249f17b
fmt
AntoinePrv Jan 21, 2026
8163700
C++20 NTTP options
AntoinePrv Jan 23, 2026
a3720d4
Homogenous wording
AntoinePrv Jan 23, 2026
13 changes: 9 additions & 4 deletions cpp/src/arrow/util/bit_stream_utils_internal.h
@@ -273,14 +273,19 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
batch_size = static_cast<int>(remaining_bits / num_bits);
}

const ::arrow::internal::UnpackOptions opts{
/* .batch_size= */ batch_size,
/* .bit_width= */ num_bits,
/* .bit_offset= */ bit_offset_,
/* .max_read_bytes= */ max_bytes_ - byte_offset_,
};

if constexpr (std::is_same_v<T, bool>) {
::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
bit_offset_);
::arrow::internal::unpack(buffer_ + byte_offset_, v, opts);

} else {
::arrow::internal::unpack(buffer_ + byte_offset_,
reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
num_bits, bit_offset_);
reinterpret_cast<std::make_unsigned_t<T>*>(v), opts);
}

Advance(batch_size * num_bits);
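For orientation, here is a minimal sketch of the UnpackOptions aggregate that the new call sites assume, inferred from the designated-initializer comments above. Its real definition is not part of this diff (it presumably lives in the bpacking headers), so the field types and defaults below are assumptions, not the actual declaration.

#include <cstdint>

namespace arrow::internal {

// Hypothetical sketch only: field names mirror the call sites, types are guessed.
struct UnpackOptions {
  int batch_size = 0;           // number of values to unpack
  int bit_width = 0;            // width in bits of each packed value
  int bit_offset = 0;           // bit offset into the first input byte
  int64_t max_read_bytes = -1;  // cap on bytes read from the input; -1 presumably means no cap
};

}  // namespace arrow::internal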
23 changes: 12 additions & 11 deletions cpp/src/arrow/util/bpacking.cc
@@ -17,7 +17,6 @@

#include <array>

#include "arrow/util/bpacking_dispatch_internal.h"
#include "arrow/util/bpacking_internal.h"
#include "arrow/util/bpacking_scalar_internal.h"
#include "arrow/util/bpacking_simd_internal.h"
@@ -34,9 +33,11 @@ struct UnpackDynamicFunction {

static constexpr auto implementations() {
return std::array{
// Current SIMD unpack algorithm works terribly on SSE4.2 due to lack of variable
// rshift and poor xsimd fallback.
#if defined(ARROW_HAVE_SSE4_2)
Implementation{DispatchLevel::NONE, &unpack_sse4_2<Uint>},
#else
Implementation{DispatchLevel::NONE, &unpack_scalar<Uint>},
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX2)
Implementation{DispatchLevel::AVX2, &unpack_avx2<Uint>},
#endif
@@ -50,19 +51,19 @@
} // namespace

template <typename Uint>
void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) {
void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
#if defined(ARROW_HAVE_NEON)
return unpack_neon(in, out, batch_size, num_bits, bit_offset);
return unpack_neon(in, out, opts);
#else
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
return dispatch.func(in, out, batch_size, num_bits, bit_offset);
return dispatch.func(in, out, opts);
#endif
}

template void unpack<bool>(const uint8_t*, bool*, int, int, int);
template void unpack<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
template void unpack<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
template void unpack<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
template void unpack<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&);
template void unpack<uint8_t>(const uint8_t*, uint8_t*, const UnpackOptions&);
template void unpack<uint16_t>(const uint8_t*, uint16_t*, const UnpackOptions&);
template void unpack<uint32_t>(const uint8_t*, uint32_t*, const UnpackOptions&);
template void unpack<uint64_t>(const uint8_t*, uint64_t*, const UnpackOptions&);

} // namespace arrow::internal
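A short usage sketch of the updated entry point, assuming unpack and UnpackOptions are declared in bpacking_internal.h as the includes above suggest; the 3-bit/128-value numbers and the packed_bytes pointer are illustrative, and the -1 sentinel for max_read_bytes follows the benchmark code below.

#include <cstdint>
#include <vector>

#include "arrow/util/bpacking_internal.h"

void Example(const uint8_t* packed_bytes) {
  // Unpack 128 values packed at 3 bits each, starting at bit offset 0,
  // without capping how many input bytes may be read.
  std::vector<uint32_t> out(128);
  const ::arrow::internal::UnpackOptions opts{
      /* .batch_size= */ 128,
      /* .bit_width= */ 3,
      /* .bit_offset= */ 0,
      /* .max_read_bytes= */ -1,
  };
  ::arrow::internal::unpack(packed_bytes, out.data(), opts);
}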
93 changes: 85 additions & 8 deletions cpp/src/arrow/util/bpacking_benchmark.cc
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <memory>
#include <stdexcept>
#include <vector>

@@ -33,7 +34,7 @@ namespace arrow::internal {
namespace {

template <typename Int>
using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
using UnpackFunc = void (*)(const uint8_t*, Int*, const UnpackOptions&);

/// Get the number of bytes associated with a packing.
constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
@@ -86,33 +87,62 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
const uint8_t* packed_ptr =
GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1);

std::vector<Int> unpacked(num_values, 0);
auto unpacked = std::make_unique<Int[]>(num_values);

const ::arrow::internal::UnpackOptions opts{
/* .batch_size= */ num_values,
/* .bit_width= */ bit_width,
/* .bit_offset= */ 0,
/* .max_read_bytes= */ -1,
};

for (auto _ : state) {
unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
unpack(packed_ptr, unpacked.get(), opts);
benchmark::ClobberMemory();
}
state.SetItemsProcessed(num_values * state.iterations());
}

constexpr int32_t kMinRange = 64;
constexpr int32_t kMaxRange = 32768;
/// Currently, the minimum unpack SIMD kernel size is 32 and the bit packing encoder will
/// not emit runs larger than 512 (though other implementations might), so we bias the
/// benchmarks towards a rather small scale.
static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2);
constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8};
constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};

static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
{0, 1},
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues8 = {
kBitWidths8,
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues16 = {
kBitWidths16,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues32 = {
kBitWidths32,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues64 = {
kBitWidths64,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};
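// Illustration, not part of the diff: ArgsProduct takes the cartesian product of each
// table above, and kNumValuesRange expands to {32, 64, 128, 256, 512} because
// benchmark::CreateRange(32, 512, /*multi=*/2) doubles from 32 up to 512. The uint32_t
// table, for instance, registers one run per (bit_width, num_values) pair:
//
//   for (int64_t bit_width : {1, 2, 8, 20}) {               // kBitWidths32
//     for (int64_t num_values : {32, 64, 128, 256, 512}) {  // kNumValuesRange
//       // one benchmark run; BM_Unpack presumably reads the pair back through
//       // state.range(0) and state.range(1)
//     }
//   }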

/// Non-template wrapper; nudge for MSVC templates inside the BENCHMARK_CAPTURE macro.
void BM_UnpackBool(benchmark::State& state, bool aligned, UnpackFunc<bool> unpack,
bool skip = false, std::string skip_msg = "") {
return BM_Unpack<bool>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Non-template wrapper; nudge for MSVC templates inside the BENCHMARK_CAPTURE macro.
void BM_UnpackUint8(benchmark::State& state, bool aligned, UnpackFunc<uint8_t> unpack,
bool skip = false, std::string skip_msg = "") {
return BM_Unpack<uint8_t>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Non-template wrapper; nudge for MSVC templates inside the BENCHMARK_CAPTURE macro.
void BM_UnpackUint16(benchmark::State& state, bool aligned, UnpackFunc<uint16_t> unpack,
bool skip = false, std::string skip_msg = "") {
@@ -129,14 +159,39 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t>
return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
}

BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &unpack_scalar<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false, &unpack_scalar<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, &unpack_scalar<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);

#if defined(ARROW_HAVE_SSE4_2)
BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &unpack_sse4_2<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false, &unpack_sse4_2<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, &unpack_sse4_2<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, &unpack_sse4_2<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, &unpack_sse4_2<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX2)
BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, &unpack_avx2<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &unpack_avx2<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &unpack_avx2<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
@@ -152,6 +207,14 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &unpack_avx2<uint64_t>,
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX512)
BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &unpack_avx512<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false, &unpack_avx512<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false, &unpack_avx512<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
@@ -167,6 +230,10 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, &unpack_avx512<uint64
#endif

#if defined(ARROW_HAVE_NEON)
BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, &unpack_neon<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &unpack_neon<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &unpack_neon<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &unpack_neon<uint32_t>)
@@ -175,6 +242,16 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);

BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);

BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)