From 3ebc1321e6782919980d3410d3bc527fd77751fc Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Fri, 21 Mar 2025 11:04:26 -0500 Subject: [PATCH v8 2/3] Neon popcount support. --- src/include/port/pg_bitutils.h | 9 ++ src/port/Makefile | 1 + src/port/meson.build | 1 + src/port/pg_bitutils.c | 22 +++- src/port/pg_popcount_aarch64.c | 203 +++++++++++++++++++++++++++++++++ 5 files changed, 230 insertions(+), 6 deletions(-) create mode 100644 src/port/pg_popcount_aarch64.c diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 70bf65c04e4..9aa07e5d574 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -298,6 +298,15 @@ pg_ceil_log2_64(uint64 num) #endif #endif +/* + * On AArch64, we can use Neon instructions if the compiler provides access to + * them (as indicated by __ARM_NEON). As in simd.h, we assume that all + * available 64-bit hardware has Neon support. + */ +#if defined(__aarch64__) && defined(__ARM_NEON) +#define POPCNT_AARCH64 1 +#endif + #ifdef POPCNT_X86_64 /* Attempt to use the POPCNT instruction, but perform a runtime check first */ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); diff --git a/src/port/Makefile b/src/port/Makefile index 4c224319512..cb86b7141e6 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -44,6 +44,7 @@ OBJS = \ noblock.o \ path.o \ pg_bitutils.o \ + pg_popcount_aarch64.o \ pg_popcount_avx512.o \ pg_strong_random.o \ pgcheckdir.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d43..cad0dd8f4f8 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_popcount_aarch64.c', 'pg_popcount_avx512.c', 'pg_strong_random.c', 'pgcheckdir.c', diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 34904c2fbd9..8b6f20b54e9 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -103,10 +103,15 @@ const uint8 pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; +/* + * If we are building the Neon versions, we don't need the "slow" fallbacks. + */ +#ifndef POPCNT_AARCH64 static inline int pg_popcount32_slow(uint32 word); static inline int pg_popcount64_slow(uint64 word); static uint64 pg_popcount_slow(const char *buf, int bytes); static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask); +#endif #ifdef POPCNT_X86_64 static bool pg_popcount_available(void); @@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask) #endif /* POPCNT_X86_64 */ +/* + * If we are building the Neon versions, we don't need the "slow" fallbacks. + */ +#ifndef POPCNT_AARCH64 /* * pg_popcount32_slow @@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask) return popcnt; } -#ifndef POPCNT_X86_64 +#endif /* ! POPCNT_AARCH64 */ + +#if !defined(POPCNT_X86_64) && !defined(POPCNT_AARCH64) /* - * When the POPCNT instruction is not available, there's no point in using + * When special CPU instructions are not available, there's no point in using * function pointers to vary the implementation between the fast and slow - * method. We instead just make these actual external functions when - * POPCNT_X86_64 is not defined. The compiler should be able to inline - * the slow versions here. + * method. We instead just make these actual external functions. The compiler + * should be able to inline the slow versions here. */ int pg_popcount32(uint32 word) @@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) return pg_popcount_masked_slow(buf, bytes, mask); } -#endif /* !POPCNT_X86_64 */ +#endif /* ! POPCNT_X86_64 && ! POPCNT_AARCH64 */ diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c new file mode 100644 index 00000000000..426bae660ef --- /dev/null +++ b/src/port/pg_popcount_aarch64.c @@ -0,0 +1,203 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_aarc64.c + * Holds the AArch64 pg_popcount() implementations. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_aarch64.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "port/pg_bitutils.h" + +#ifdef POPCNT_AARCH64 + +#include + +/* + * pg_popcount32 + * Return number of 1 bits in word + */ +int +pg_popcount32(uint32 word) +{ + return pg_popcount64((uint64) word); +} + +/* + * pg_popcount64 + * Return number of 1 bits in word + */ +int +pg_popcount64(uint64 word) +{ + return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word))); +} + +/* + * pg_popcount_optimized + * Returns number of 1 bits in buf + */ +uint64 +pg_popcount_optimized(const char *buf, int bytes) +{ + uint8x16_t vec; + uint32 bytes_per_iteration = 4 * sizeof(uint8x16_t); + uint64x2_t accum1 = vdupq_n_u64(0), + accum2 = vdupq_n_u64(0), + accum3 = vdupq_n_u64(0), + accum4 = vdupq_n_u64(0); + uint64 popcnt = 0; + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. + */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + vec = vld1q_u8((const uint8 *) buf); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vld1q_u8((const uint8 *) buf); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vld1q_u8((const uint8 *) buf); + accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vld1q_u8((const uint8 *) buf); + accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + } + + /* + * If enough data remains, do another iteration on a block of two + * registers. + */ + bytes_per_iteration = 2 * sizeof(uint8x16_t); + if (bytes >= bytes_per_iteration) + { + vec = vld1q_u8((const uint8 *) buf); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vld1q_u8((const uint8 *) buf); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + bytes -= bytes_per_iteration; + } + + /* + * Add the accumulators. + */ + popcnt += vaddvq_u64(vaddq_u64(accum1, accum2)); + popcnt += vaddvq_u64(vaddq_u64(accum3, accum4)); + + /* + * Process remaining 8-byte blocks. + */ + for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) + { + popcnt += pg_popcount64(*((uint64 *) buf)); + buf += sizeof(uint64); + } + + /* + * Process any remaining data byte-by-byte. + */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++]; + + return popcnt; +} + +/* + * pg_popcount_masked_optimized + * Returns number of 1 bits in buf after applying the mask to each byte + */ +uint64 +pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) +{ + uint8x16_t vec; + uint32 bytes_per_iteration = 4 * sizeof(uint8x16_t); + uint64x2_t accum1 = vdupq_n_u64(0), + accum2 = vdupq_n_u64(0), + accum3 = vdupq_n_u64(0), + accum4 = vdupq_n_u64(0); + uint64 popcnt = 0, + mask64 = ~UINT64CONST(0) / 0xFF * mask; + uint8x16_t maskv = vdupq_n_u8(mask); + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. + */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + } + + /* + * If enough data remains, do another iteration on a block of two + * registers. + */ + bytes_per_iteration = 2 * sizeof(uint8x16_t); + if (bytes >= bytes_per_iteration) + { + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + bytes -= bytes_per_iteration; + } + + /* + * Add the accumulators. + */ + popcnt += vaddvq_u64(vaddq_u64(accum1, accum2)); + popcnt += vaddvq_u64(vaddq_u64(accum3, accum4)); + + /* + * Process remining 8-byte blocks. + */ + for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) + { + popcnt += pg_popcount64(*((uint64 *) buf) & mask64); + buf += sizeof(uint64); + } + + /* + * Process any remaining data byte-by-byte. + */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask]; + + return popcnt; +} + +#endif /* POPCNT_AARCH64 */ -- 2.39.5 (Apple Git-154)