From 36f954a5735911af3e057f24d8803c32819e738d Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandbossart@gmail.com>
Date: Fri, 21 Mar 2025 20:24:44 -0500
Subject: [PATCH v8 3/3] SVE popcount support.

---
 config/c-compiler.m4           | 64 +++++++++
 configure                      | 84 ++++++++++++
 configure.ac                   |  9 ++
 meson.build                    | 61 +++++++++
 src/include/pg_config.h.in     |  3 +
 src/include/port/pg_bitutils.h | 17 +++
 src/port/pg_popcount_aarch64.c | 235 ++++++++++++++++++++++++++++++++-
 7 files changed, 467 insertions(+), 6 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 3712e81e38c..d1e7461f6f6 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -708,3 +708,67 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_AVX512_POPCNT_INTRINSICS
+
+# PGAC_SVE_POPCNT_INTRINSICS
+# --------------------------
+# Check if the compiler supports the SVE popcount instructions using the
+# svptrue_b64, svdup_u64, svcntb, svld1, svadd_x, svcnt_x, svaddv,
+# svwhilelt_b8, and svand_x intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics.
+AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for svcnt_x], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>
+
+    char buf[500];
+
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("arch=armv8-a+sve")))
+    #endif
+    static int popcount_test(void)
+    {
+        uint32_t    vec_len = svcntb();
+        int         bytes = sizeof(buf);
+        svuint64_t  accum1 = svdup_u64(0),
+                    accum2 = svdup_u64(0);
+        svbool_t    pred = svptrue_b64();
+        uint64_t    popcnt = 0,
+                    mask = 0x5555555555555555;
+        char       *p = buf;
+
+        for (; bytes >= vec_len * 2; bytes -= vec_len * 2)
+        {
+            svuint64_t  vec;
+
+            vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+            accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+            p += vec_len;
+
+            vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+            accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+            p += vec_len;
+        }
+
+        popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+        for (; bytes >= vec_len; bytes -= vec_len)
+        {
+            svuint8_t   vec;
+
+            pred = svwhilelt_b8(0, bytes);
+            vec = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55);
+            popcnt += svaddv(pred, svcnt_x(pred, vec));
+            p += vec_len;
+        }
+
+        return (int) popcnt;
+    }]],
+  [return popcount_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_sve_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_SVE_POPCNT_INTRINSICS
diff --git a/configure b/configure
index fac1e9a4e39..85f4b24caaa 100755
--- a/configure
+++ b/configure
@@ -17378,6 +17378,90 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
   fi
 fi
 
+# Check for SVE popcount intrinsics
+#
+if test x"$host_cpu" = x"aarch64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x" >&5
+$as_echo_n "checking for svcnt_x... " >&6; }
+if ${pgac_cv_sve_popcnt_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_sve.h>
+
+    char buf[500];
+
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("arch=armv8-a+sve")))
+    #endif
+    static int popcount_test(void)
+    {
+        uint32_t    vec_len = svcntb();
+        int         bytes = sizeof(buf);
+        svuint64_t  accum1 = svdup_u64(0),
+                    accum2 = svdup_u64(0);
+        svbool_t    pred = svptrue_b64();
+        uint64_t    popcnt = 0,
+                    mask = 0x5555555555555555;
+        char       *p = buf;
+
+        for (; bytes >= vec_len * 2; bytes -= vec_len * 2)
+        {
+            svuint64_t  vec;
+
+            vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+            accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+            p += vec_len;
+
+            vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+            accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+            p += vec_len;
+        }
+
+        popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+        for (; bytes >= vec_len; bytes -= vec_len)
+        {
+            svuint8_t   vec;
+
+            pred = svwhilelt_b8(0, bytes);
+            vec = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55);
+            popcnt += svaddv(pred, svcnt_x(pred, vec));
+            p += vec_len;
+        }
+
+        return (int) popcnt;
+    }
+int
+main ()
+{
+return popcount_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_sve_popcnt_intrinsics=yes
+else
+  pgac_cv_sve_popcnt_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sve_popcnt_intrinsics" >&5
+$as_echo "$pgac_cv_sve_popcnt_intrinsics" >&6; }
+if test x"$pgac_cv_sve_popcnt_intrinsics" = x"yes"; then
+  pgac_sve_popcnt_intrinsics=yes
+fi
+
+  if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
+
+$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
diff --git a/configure.ac b/configure.ac
index b6d02f5ecc7..64b52940658 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2057,6 +2057,15 @@ if test x"$host_cpu" = x"x86_64"; then
   fi
 fi
 
+# Check for SVE popcount intrinsics
+#
+if test x"$host_cpu" = x"aarch64"; then
+  PGAC_SVE_POPCNT_INTRINSICS()
+  if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
+    AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE popcount instructions with a runtime check.])
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 PGAC_SSE42_CRC32_INTRINSICS()
diff --git a/meson.build b/meson.build
index 7cf518a2765..de7e695ab6f 100644
--- a/meson.build
+++ b/meson.build
@@ -2285,6 +2285,67 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of SVE popcount intrinsics.
+###############################################################
+
+if host_cpu == 'aarch64'
+
+  prog = '''
+#include <arm_sve.h>
+
+char buf[500];
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("arch=armv8-a+sve")))
+#endif
+int main(void)
+{
+    uint32_t    vec_len = svcntb();
+    int         bytes = sizeof(buf);
+    svuint64_t  accum1 = svdup_u64(0),
+                accum2 = svdup_u64(0);
+    svbool_t    pred = svptrue_b64();
+    uint64_t    popcnt = 0,
+                mask = 0x5555555555555555;
+    char       *p = buf;
+
+    for (; bytes >= vec_len * 2; bytes -= vec_len * 2)
+    {
+        svuint64_t  vec;
+
+        vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+        accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+        p += vec_len;
+
+        vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+        accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+        p += vec_len;
+    }
+
+    popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+    for (; bytes >= vec_len; bytes -= vec_len)
+    {
+        svuint8_t   vec;
+
+        pred = svwhilelt_b8(0, bytes);
+        vec = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55);
+        popcnt += svaddv(pred, svcnt_x(pred, vec));
+        p += vec_len;
+    }
+
+    return (int) popcnt;
+}
+'''
+
+  if cc.links(prog, name: 'SVE popcount', args: test_c_args)
+    cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Select CRC-32C implementation.
 #
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index db6454090d2..2a67db077a9 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -706,6 +706,9 @@
 /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
 #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use SVE popcount instructions with a runtime check. */
+#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
 /* Define to build with systemd support. (--with-systemd) */
 #undef USE_SYSTEMD
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 9aa07e5d574..1bcb4ecb8ab 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -324,6 +324,23 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
+#elif POPCNT_AARCH64
+/* Use the Neon version of pg_popcount{32,64} without function pointer. */
+extern int pg_popcount32(uint32 word);
+extern int pg_popcount64(uint64 word);
+
+/*
+ * We can try to use an SVE-optimized pg_popcount() on some systems.  For
+ * that, we do use a function pointer.
+ */
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
+#else
+extern uint64 pg_popcount_optimized(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
+#endif
+
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 
 extern int pg_popcount32(uint32 word);
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index 426bae660ef..48441269639 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -18,6 +18,229 @@
 
 #include <arm_neon.h>
 
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+#include <arm_sve.h>
+
+#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+#endif
+#endif
+
+/*
+ * The Neon versions are built regardless of whether we are building the SVE
+ * versions.
+ */
+static uint64 pg_popcount_neon(const char *buf, int bytes);
+static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);
+
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
+/*
+ * These are the SVE implementations of the popcount functions.
+ */
+static uint64 pg_popcount_sve(const char *buf, int bytes);
+static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);
+
+/*
+ * The function pointers are initially set to "choose" functions.  These
+ * functions will first set the pointers to the right implementations (based
+ * on what the current CPU supports) and then will call the pointer to fulfill
+ * the caller's request.
+ */
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
+uint64      (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64      (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
+
+static inline bool
+pg_popcount_sve_available(void)
+{
+#ifdef HAVE_ELF_AUX_INFO
+    unsigned long value;
+
+    return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+        (value & HWCAP_SVE) != 0;
+#elif defined(HAVE_GETAUXVAL)
+    return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
+#else
+    return false;
+#endif
+}
+
+static inline void
+choose_popcount_functions(void)
+{
+    if (pg_popcount_sve_available())
+    {
+        pg_popcount_optimized = pg_popcount_sve;
+        pg_popcount_masked_optimized = pg_popcount_masked_sve;
+    }
+    else
+    {
+        pg_popcount_optimized = pg_popcount_neon;
+        pg_popcount_masked_optimized = pg_popcount_masked_neon;
+    }
+}
+
+static uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+    choose_popcount_functions();
+    return pg_popcount_optimized(buf, bytes);
+}
+
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+    choose_popcount_functions();
+    return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
+/*
+ * pg_popcount_sve
+ *      Returns number of 1 bits in buf
+ */
+pg_attribute_target("arch=armv8-a+sve")
+static uint64
+pg_popcount_sve(const char *buf, int bytes)
+{
+    uint32      vec_len = svcntb(),
+                bytes_per_iteration = 4 * vec_len;
+    svuint64_t  accum1 = svdup_u64(0),
+                accum2 = svdup_u64(0),
+                accum3 = svdup_u64(0),
+                accum4 = svdup_u64(0);
+    svbool_t    pred = svptrue_b64();
+    uint64      popcnt = 0;
+
+    /*
+     * For better instruction-level parallelism, each loop iteration operates
+     * on a block of four registers.
+     */
+    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+    {
+        svuint64_t  vec;
+
+        vec = svld1(pred, (const uint64 *) buf);
+        accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+        buf += vec_len;
+
+        vec = svld1(pred, (const uint64 *) buf);
+        accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+        buf += vec_len;
+
+        vec = svld1(pred, (const uint64 *) buf);
+        accum3 = svadd_x(pred, accum3, svcnt_x(pred, vec));
+        buf += vec_len;
+
+        vec = svld1(pred, (const uint64 *) buf);
+        accum4 = svadd_x(pred, accum4, svcnt_x(pred, vec));
+        buf += vec_len;
+    }
+
+    popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+    popcnt += svaddv(pred, svadd_x(pred, accum3, accum4));
+
+    /*
+     * Process any remaining data.
+     */
+    for (; bytes >= vec_len; bytes -= vec_len)
+    {
+        svuint8_t   vec;
+
+        pred = svwhilelt_b8(0, bytes);
+        vec = svld1(pred, (const uint8 *) buf);
+        popcnt += svaddv(pred, svcnt_x(pred, vec));
+        buf += vec_len;
+    }
+
+    return popcnt;
+}
+
+/*
+ * pg_popcount_masked_sve
+ *      Returns number of 1 bits in buf after applying the mask to each byte
+ */
+pg_attribute_target("arch=armv8-a+sve")
+static uint64
+pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
+{
+    uint32      vec_len = svcntb(),
+                bytes_per_iteration = 4 * vec_len;
+    svuint64_t  accum1 = svdup_u64(0),
+                accum2 = svdup_u64(0),
+                accum3 = svdup_u64(0),
+                accum4 = svdup_u64(0);
+    svbool_t    pred = svptrue_b64();
+    uint64      popcnt = 0,
+                mask64 = ~UINT64CONST(0) / 0xFF * mask;
+
+    /*
+     * For better instruction-level parallelism, each loop iteration operates
+     * on a block of four registers.
+     */
+    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+    {
+        svuint64_t  vec;
+
+        vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+        accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+        buf += vec_len;
+
+        vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+        accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+        buf += vec_len;
+
+        vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+        accum3 = svadd_x(pred, accum3, svcnt_x(pred, vec));
+        buf += vec_len;
+
+        vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+        accum4 = svadd_x(pred, accum4, svcnt_x(pred, vec));
+        buf += vec_len;
+    }
+
+    popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+    popcnt += svaddv(pred, svadd_x(pred, accum3, accum4));
+
+    /*
+     * Process any remaining data.
+     */
+    for (; bytes >= vec_len; bytes -= vec_len)
+    {
+        svuint8_t   vec;
+
+        pred = svwhilelt_b8(0, bytes);
+        vec = svand_x(pred, svld1(pred, (const uint8 *) buf), mask);
+        popcnt += svaddv(pred, svcnt_x(pred, vec));
+        buf += vec_len;
+    }
+
+    return popcnt;
+}
+
+#else                           /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * When the SVE version isn't available, there's no point in using function
+ * pointers to vary the implementation.  We instead just make these actual
+ * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined.
+ * The compiler should be able to inline the slow versions here.
+ */
+uint64
+pg_popcount_optimized(const char *buf, int bytes)
+{
+    return pg_popcount_neon(buf, bytes);
+}
+
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+    return pg_popcount_masked_neon(buf, bytes, mask);
+}
+
+#endif                          /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
+
 /*
  * pg_popcount32
  *      Return number of 1 bits in word
@@ -39,11 +262,11 @@ pg_popcount64(uint64 word)
 }
 
 /*
- * pg_popcount_optimized
+ * pg_popcount_neon
  *      Returns number of 1 bits in buf
  */
-uint64
-pg_popcount_optimized(const char *buf, int bytes)
+static uint64
+pg_popcount_neon(const char *buf, int bytes)
 {
     uint8x16_t  vec;
     uint32      bytes_per_iteration = 4 * sizeof(uint8x16_t);
@@ -119,11 +342,11 @@ pg_popcount_optimized(const char *buf, int bytes)
 }
 
 /*
- * pg_popcount_masked_optimized
+ * pg_popcount_masked_neon
  *      Returns number of 1 bits in buf after applying the mask to each byte
  */
-uint64
-pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+static uint64
+pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
 {
     uint8x16_t  vec;
     uint32      bytes_per_iteration = 4 * sizeof(uint8x16_t);
-- 
2.39.5 (Apple Git-154)