From 0b51becb0505d5bde5f8e2acc90a7f4f4b604fe3 Mon Sep 17 00:00:00 2001 From: Rama Malladi Date: Wed, 27 Nov 2024 07:15:23 -0600 Subject: [PATCH] SVE popcount support --- config/c-compiler.m4 | 33 +++++++++++ configure | 104 +++++++++++++++++++++++++++++++++ configure.ac | 15 +++++ meson.build | 37 ++++++++++++ src/Makefile.global.in | 2 + src/include/pg_config.h.in | 3 + src/include/port/pg_bitutils.h | 9 +++ src/port/Makefile | 6 ++ src/port/meson.build | 1 + src/port/pg_bitutils.c | 18 ++++++ src/port/pg_popcount_sve.c | 103 ++++++++++++++++++++++++++++++++ 11 files changed, 331 insertions(+) create mode 100644 src/port/pg_popcount_sve.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index a129edb88e..eee9720931 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -754,3 +754,36 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_AVX512_POPCNT_INTRINSICS + +# PGAC_SVE_POPCNT_INTRINSICS +# ---------------------------- +# Check if the compiler supports the SVE popcount instructions. +# +# An optional compiler flag can be passed as argument (e.g. +# -march=armv8-a+sve). If the intrinsics are supported, sets +# pgac_sve_popcnt_intrinsics, and CFLAGS_POPCNT. 
+AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics_$1])])dnl +AC_CACHE_CHECK([for svcnt_u8_z with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>], + [svuint8_t accum = svdup_u8(0); + svuint8_t buf = svdup_u8(0); + svbool_t pgTrue = svptrue_b8(); + uint64_t popcnt = 0; + + accum = svcnt_u8_z(pgTrue, buf); + popcnt = svaddv_u8(pgTrue, accum); + + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_POPCNT="$1" + pgac_sve_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_SVE_POPCNT_INTRINSICS diff --git a/configure b/configure index 199d666aa7..ab6092d3c5 100755 --- a/configure +++ b/configure @@ -646,7 +646,9 @@ MSGMERGE MSGFMT_FLAGS MSGFMT PG_CRC32C_OBJS +PG_POPCNT_OBJS CFLAGS_CRC +CFLAGS_POPCNT LIBOBJS OPENSSL ZSTD @@ -17653,6 +17655,108 @@ fi + + +# Check for SVE popcount intrinsics +# +# First check if svcnt_u8_z intrinsics can be used with the default compiler +# flags. If not, check if adding -march=armv8-a+sve flag helps. +# CFLAGS_POPCNT is set if the extra flag is required. +CFLAGS_POPCNT="" +PG_POPCNT_OBJS="" +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_u8_z with CFLAGS=" >&5 +$as_echo_n "checking for svcnt_u8_z with CFLAGS=... " >&6; } +if ${pgac_cv_sve_popcnt_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <arm_sve.h> + +int main(void) +{ + svuint8_t accum = svdup_u8(0); + svuint8_t buf = svdup_u8(0); + svbool_t pgTrue = svptrue_b8(); + uint64_t popcnt = 0; + + accum = svcnt_u8_z(pgTrue, buf); + popcnt = svaddv_u8(pgTrue, accum); + + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_sve_popcnt_intrinsics_=yes +else + pgac_cv_sve_popcnt_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sve_popcnt_intrinsics_" >&5 +$as_echo "$pgac_cv_sve_popcnt_intrinsics_" >&6; } +if test x"$pgac_cv_sve_popcnt_intrinsics_" = x"yes"; then + CFLAGS_POPCNT="" + pgac_sve_popcnt_intrinsics=yes +fi + +if test x"$pgac_sve_popcnt_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_u8_z with CFLAGS=-march=armv8-a+sve" >&5 +$as_echo_n "checking for svcnt_u8_z with CFLAGS=-march=armv8-a+sve... " >&6; } +if ${pgac_cv_sve_popcnt_intrinsics__march_armv8_apsve+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -march=armv8-a+sve" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <arm_sve.h> + +int main(void) +{ + svuint8_t accum = svdup_u8(0); + svuint8_t buf = svdup_u8(0); + svbool_t pgTrue = svptrue_b8(); + uint64_t popcnt = 0; + + accum = svcnt_u8_z(pgTrue, buf); + popcnt = svaddv_u8(pgTrue, accum); + + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_sve_popcnt_intrinsics__march_armv8_apsve=yes +else + pgac_cv_sve_popcnt_intrinsics__march_armv8_apsve=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sve_popcnt_intrinsics__march_armv8_apsve" >&5 +$as_echo "$pgac_cv_sve_popcnt_intrinsics__march_armv8_apsve" >&6; } +if test x"$pgac_cv_sve_popcnt_intrinsics__march_armv8_apsve" = x"yes"; then + CFLAGS_POPCNT="-march=armv8-a+sve" + pgac_sve_popcnt_intrinsics=yes +fi +fi + +if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then + PG_POPCNT_OBJS="pg_popcount_sve.o" + +$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h + +fi + + + + # Select CRC-32C implementation. # # If we are targeting a processor that has Intel SSE 4.2 instructions, we can diff --git a/configure.ac b/configure.ac index 4f56bb5062..3c0b8ffdbe 100644 --- a/configure.ac +++ b/configure.ac @@ -2107,6 +2107,21 @@ PGAC_LOONGARCH_CRC32C_INTRINSICS() AC_SUBST(CFLAGS_CRC) +# Check for ARMv8 SVE popcount intrinsics +# +CFLAGS_POPCNT="" +PG_POPCNT_OBJS="" +PGAC_SVE_POPCNT_INTRINSICS([]) +if test x"$pgac_sve_popcnt_intrinsics" != x"yes"; then + PGAC_SVE_POPCNT_INTRINSICS([-march=armv8-a+sve]) +fi +if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then + PG_POPCNT_OBJS="pg_popcount_sve.o" + AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE popcount instructions with a runtime check.]) +fi +AC_SUBST(CFLAGS_POPCNT) +AC_SUBST(PG_POPCNT_OBJS) + # Select CRC-32C implementation. 
# # If we are targeting a processor that has Intel SSE 4.2 instructions, we can diff --git a/meson.build b/meson.build index 83e61d0f4a..7c927883af 100644 --- a/meson.build +++ b/meson.build @@ -2205,6 +2205,43 @@ int main(void) endif +############################################################### +# Check for the availability of SVE popcount intrinsics. +############################################################### +cflags_popcnt = [] +if host_cpu == 'arm' or host_cpu == 'aarch64' + + prog = ''' +#include <arm_sve.h> + +int main(void) +{ + svuint8_t accum = svdup_u8(0); + svuint8_t buf = svdup_u8(0); + svbool_t pgTrue = svptrue_b8(); + uint64_t popcnt = 0; + + accum = svcnt_u8_z(pgTrue, buf); + popcnt = svaddv_u8(pgTrue, accum); + + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; +} +''' + + if cc.links(prog, name: 'SVE popcount without -march=armv8-a+sve', + args: test_c_args) + # Use ARM POPCNT Extension, with runtime check + cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1) + elif cc.links(prog, name: 'SVE popcount with -march=armv8-a+sve', + args: test_c_args + ['-march=armv8-a+sve']) + # Use ARM POPCNT Extension, with runtime check + cflags_popcnt += ['-march=armv8-a+sve'] + cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1) + endif +endif + + +############################################################### # Select CRC-32C implementation. 
# diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 0f38d712d1..523072a0db 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -263,6 +263,7 @@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@ CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@ CFLAGS_CRC = @CFLAGS_CRC@ +CFLAGS_POPCNT = @CFLAGS_POPCNT@ PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@ PERMIT_MISSING_VARIABLE_DECLARATIONS = @PERMIT_MISSING_VARIABLE_DECLARATIONS@ CXXFLAGS = @CXXFLAGS@ @@ -769,6 +770,7 @@ LIBOBJS = @LIBOBJS@ # files needed for the chosen CRC-32C implementation PG_CRC32C_OBJS = @PG_CRC32C_OBJS@ +PG_POPCNT_OBJS = @PG_POPCNT_OBJS@ LIBS := -lpgcommon -lpgport $(LIBS) diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 40e4b2e381..6baec9549a 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -666,6 +666,9 @@ /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +/* Define to 1 to use SVE popcount instructions with a runtime check. */ +#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + /* Define to 1 to build with Bonjour support. (--with-bonjour) */ #undef USE_BONJOUR diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 4d88478c9c..d6a9aee00e 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -321,11 +321,20 @@ extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); /* Use a portable implementation -- no need for a function pointer. 
*/ extern int pg_popcount32(uint32 word); extern int pg_popcount64(uint64 word); +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); +#else extern uint64 pg_popcount_optimized(const char *buf, int bytes); +#endif extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); #endif /* TRY_POPCNT_FAST */ +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +extern uint64 pg_popcount_sve(const char *buf, int bytes); +extern int check_sve_support(void); +#endif + /* * Returns the number of 1-bits in buf. * diff --git a/src/port/Makefile b/src/port/Makefile index 366c814bd9..7ecf776069 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS) OBJS = \ $(LIBOBJS) \ $(PG_CRC32C_OBJS) \ + $(PG_POPCNT_OBJS) \ bsearch_arg.o \ chklocale.o \ inet_net_ntop.o \ @@ -92,6 +93,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +# pg_popcount_sve.o and its _srv.o version need CFLAGS_POPCNT +pg_popcount_sve.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_sve_shlib.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_sve_srv.o: CFLAGS+=$(CFLAGS_POPCNT) + # # Shared library versions of object files # diff --git a/src/port/meson.build b/src/port/meson.build index 83a0632520..7af85c8111 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -91,6 +91,7 @@ replace_funcs_pos = [ ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_popcount_sve', 'USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'], # loongarch ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'], diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 87f56e82b8..168bf24635 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -125,6 +125,22 @@ uint64 
(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choo
 uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
 #endif /* TRY_POPCNT_FAST */
 
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+
+static inline uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+	if (check_sve_support())
+		pg_popcount_optimized = pg_popcount_sve;
+	else
+		pg_popcount_optimized = pg_popcount_slow;
+	return pg_popcount_optimized(buf, bytes);
+}
+
+#endif /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
+
 #ifdef TRY_POPCNT_FAST
 
 /*
@@ -507,6 +523,7 @@ pg_popcount64(uint64 word)
 	return pg_popcount64_slow(word);
 }
 
+#ifndef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
 /*
  * pg_popcount_optimized
  * Returns the number of 1-bits in buf
@@ -516,6 +533,7 @@ pg_popcount_optimized(const char *buf, int bytes)
 {
 	return pg_popcount_slow(buf, bytes);
 }
+#endif
 
 /*
  * pg_popcount_masked_optimized
diff --git a/src/port/pg_popcount_sve.c b/src/port/pg_popcount_sve.c
new file mode 100644
index 0000000000..04a08fbcc3
--- /dev/null
+++ b/src/port/pg_popcount_sve.c
@@ -0,0 +1,135 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_sve.c
+ *	  pg_popcount() using SVE population count instruction
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_sve.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
+#include <arm_sve.h>
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+
+/* AT_HWCAP bit that advertises SVE on arm64, for C libraries lacking it */
+#ifndef HWCAP_SVE
+#define HWCAP_SVE	(1 << 22)
+#endif
+#endif							/* __linux__ */
+
+#include "port/pg_bitutils.h"
+
+/*
+ * check_sve_support
+ *		Returns nonzero if this process may execute SVE instructions.
+ *
+ * The answer comes from the AT_HWCAP entry of the ELF auxiliary vector
+ * instead of an MRS read of ID_AA64PFR0_EL1.  Reading that register from
+ * user space depends on the kernel trapping and emulating the access, and
+ * is an illegal instruction wherever it does not.  Where no auxiliary
+ * vector interface is known, report "unsupported" so that the caller keeps
+ * using the portable implementation.
+ */
+int
+check_sve_support(void)
+{
+#if defined(__linux__)
+	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
+#else
+	return 0;
+#endif
+}
+
+/*
+ * pg_popcount_sve
+ *		Returns the number of 1-bits in buf
+ *
+ * Buffers of at most 128 bytes are counted one byte lane at a time.  Larger
+ * buffers are split into a byte-wise prologue that ends on an 8-byte
+ * boundary, a main loop over 64-bit lanes unrolled by two vectors, and a
+ * byte-wise epilogue for whatever the main loop did not cover.
+ */
+uint64
+pg_popcount_sve(const char *buf, int bytes)
+{
+	svbool_t	pg8True = svptrue_b8();
+	svbool_t	pg64True;
+	svbool_t	pg;
+	svuint8_t	val8;
+	svuint8_t	cnt8;
+	svuint8_t	accum8;
+	svuint64_t	val64;
+	svuint64_t	cnt64;
+	svuint64_t	accum64;
+	svuint64_t	accum64_1;
+	uint64		popcount = 0;
+	const char *aligned_buf = NULL;
+	const char *epilogue_buf;
+	int			i;
+	int			prologue_loop_bytes;
+	int			kernel_loop_sve_cnt;
+	int			epilogue_loop_bytes;
+
+	if (bytes <= 128)
+		prologue_loop_bytes = bytes;
+	else
+	{
+		/* round buf down to a uint64 multiple, then step one uint64 forward */
+		aligned_buf = (const char *) TYPEALIGN_DOWN(sizeof(uint64_t), buf) + sizeof(uint64_t);
+		prologue_loop_bytes = aligned_buf - buf;
+	}
+
+	/* the while-predicate masks off lanes at or beyond prologue_loop_bytes */
+	for (i = 0; i < prologue_loop_bytes; i += svcntb())
+	{
+		pg = svwhilelt_b8(i, prologue_loop_bytes);
+		val8 = svld1_u8(pg, (const uint8_t *) (buf + i));
+		cnt8 = svcnt_u8_x(pg, val8);
+		popcount += svaddv_u8(pg, cnt8);
+	}
+
+	if (bytes > 128)
+	{
+		accum64 = svdup_u64(0);
+		accum64_1 = svdup_u64(0);
+		pg64True = svptrue_b64();
+
+		/* number of iterations that each consume two full vectors */
+		kernel_loop_sve_cnt = ((bytes - prologue_loop_bytes) / 8) / svcntd() / 2;
+		epilogue_loop_bytes = bytes - prologue_loop_bytes - (kernel_loop_sve_cnt * 8 * svcntd() * 2);
+		epilogue_buf = (const char *) buf + prologue_loop_bytes + (kernel_loop_sve_cnt * 8 * svcntd() * 2);
+
+		/* loop unroll by 2; i counts 64-bit elements from aligned_buf */
+		for (i = 0; i < kernel_loop_sve_cnt * 2 * (int) svcntd(); i += svcntd() * 2)
+		{
+			cnt64 = svld1_u64(pg64True, (const uint64_t *) (aligned_buf + sizeof(uint64_t) * i));
+			val64 = svcnt_u64_m(cnt64, pg64True, cnt64);
+			accum64 = svadd_u64_x(pg64True, val64, accum64);
+			cnt64 = svld1_u64(pg64True, (const uint64_t *) (aligned_buf + sizeof(uint64_t) * (i + svcntd())));
+			val64 = svcnt_u64_m(cnt64, pg64True, cnt64);
+			accum64_1 = svadd_u64_x(pg64True, val64, accum64_1);
+		}
+		popcount += svaddv_u64(pg64True, accum64);
+		popcount += svaddv_u64(pg64True, accum64_1);
+
+		/* the _z count zeroes inactive lanes, so every lane can be added */
+		accum8 = svdup_u8(0);
+		for (i = 0; i < epilogue_loop_bytes; i += svcntb())
+		{
+			pg = svwhilelt_b8(i, epilogue_loop_bytes);
+			val8 = svld1_u8(pg, (const uint8_t *) (epilogue_buf + i));
+			cnt8 = svcnt_u8_z(pg, val8);
+			accum8 = svadd_u8_m(pg8True, cnt8, accum8);
+		}
+		popcount += svaddv_u8(pg8True, accum8);
+	}
+
+	return popcount;
+}
+#endif							/* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
-- 
2.45.1