From c97e25e56347c90f169a5ce069a9ea06c873915b Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 16 Oct 2024 15:57:55 -0500 Subject: [PATCH v1 1/1] use __attribute__((target(...))) for AVX-512 stuff --- config/c-compiler.m4 | 60 +++++----- configure | 163 ++++++--------------------- configure.ac | 17 +-- meson.build | 17 +-- src/Makefile.global.in | 5 - src/include/c.h | 10 ++ src/makefiles/meson.build | 4 +- src/port/Makefile | 12 +- src/port/meson.build | 7 +- src/port/pg_popcount_avx512.c | 86 +++++++++++++- src/port/pg_popcount_avx512_choose.c | 102 ----------------- 11 files changed, 171 insertions(+), 312 deletions(-) delete mode 100644 src/port/pg_popcount_avx512_choose.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 10f8c7bd0a..aa90f8ef33 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -700,20 +700,20 @@ undefine([Ac_cachevar])dnl # Check if the compiler supports the XSAVE instructions using the _xgetbv # intrinsic function. # -# An optional compiler flag can be passed as argument (e.g., -mxsave). If the -# intrinsic is supported, sets pgac_xsave_intrinsics and CFLAGS_XSAVE. +# If the intrinsics are supported, sets pgac_xsave_intrinsics. AC_DEFUN([PGAC_XSAVE_INTRINSICS], -[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl -AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar], -[pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS $1" -AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [return _xgetbv(0) & 0xe0;])], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics])])dnl +AC_CACHE_CHECK([for _xgetbv], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include + __attribute__((target("xsave"))) + static int xsave_test(void) + { + return _xgetbv(0) & 0xe0; + }], + [return xsave_test();])], [Ac_cachevar=yes], - [Ac_cachevar=no]) -CFLAGS="$pgac_save_CFLAGS"]) + [Ac_cachevar=no])]) if test x"$Ac_cachevar" = x"yes"; then - CFLAGS_XSAVE="$1" pgac_xsave_intrinsics=yes fi undefine([Ac_cachevar])dnl @@ -725,29 +725,27 @@ undefine([Ac_cachevar])dnl # _mm512_setzero_si512, _mm512_maskz_loadu_epi8, _mm512_popcnt_epi64, # _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions. # -# Optional compiler flags can be passed as argument (e.g., -mavx512vpopcntdq -# -mavx512bw). If the intrinsics are supported, sets -# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT. +# If the intrinsics are supported, sets pgac_avx512_popcnt_intrinsics. AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS], -[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl -AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar], -[pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS $1" -AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [const char buf@<:@sizeof(__m512i)@:>@; - PG_INT64_TYPE popcnt = 0; - __m512i accum = _mm512_setzero_si512(); - const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf); - const __m512i cnt = _mm512_popcnt_epi64(val); - accum = _mm512_add_epi64(accum, cnt); - popcnt = _mm512_reduce_add_epi64(accum); - /* return computed value, to prevent the above being optimized away */ - return popcnt == 0;])], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics])])dnl +AC_CACHE_CHECK([for _mm512_popcnt_epi64], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include + __attribute__((target("avx512vpopcntdq","avx512bw"))) + static int popcount_test(void) + { + const char buf@<:@sizeof(__m512i)@:>@; + PG_INT64_TYPE popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + return (int) popcnt; + }], + [return popcount_test();])], [Ac_cachevar=yes], - [Ac_cachevar=no]) -CFLAGS="$pgac_save_CFLAGS"]) + [Ac_cachevar=no])]) if test x"$Ac_cachevar" = x"yes"; then - CFLAGS_POPCNT="$1" pgac_avx512_popcnt_intrinsics=yes fi undefine([Ac_cachevar])dnl diff --git a/configure b/configure index 3a577e463b..028f448a29 100755 --- a/configure +++ b/configure @@ -647,9 +647,6 @@ MSGFMT_FLAGS MSGFMT PG_CRC32C_OBJS CFLAGS_CRC -PG_POPCNT_OBJS -CFLAGS_POPCNT -CFLAGS_XSAVE LIBOBJS OPENSSL ZSTD @@ -17270,185 +17267,99 @@ fi # Check for XSAVE intrinsics # -CFLAGS_XSAVE="" -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=" >&5 -$as_echo_n "checking for _xgetbv with CFLAGS=... " >&6; } -if ${pgac_cv_xsave_intrinsics_+:} false; then : +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5 +$as_echo_n "checking for _xgetbv... " >&6; } +if ${pgac_cv_xsave_intrinsics+:} false; then : $as_echo_n "(cached) " >&6 else - pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS " -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -int -main () -{ -return _xgetbv(0) & 0xe0; - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - pgac_cv_xsave_intrinsics_=yes -else - pgac_cv_xsave_intrinsics_=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -CFLAGS="$pgac_save_CFLAGS" -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics_" >&5 -$as_echo "$pgac_cv_xsave_intrinsics_" >&6; } -if test x"$pgac_cv_xsave_intrinsics_" = x"yes"; then - CFLAGS_XSAVE="" - pgac_xsave_intrinsics=yes -fi - -if test x"$pgac_xsave_intrinsics" != x"yes"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=-mxsave" >&5 -$as_echo_n "checking for _xgetbv with CFLAGS=-mxsave... " >&6; } -if ${pgac_cv_xsave_intrinsics__mxsave+:} false; then : - $as_echo_n "(cached) " >&6 -else - pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS -mxsave" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include + __attribute__((target("xsave"))) + static int xsave_test(void) + { + return _xgetbv(0) & 0xe0; + } int main () { -return _xgetbv(0) & 0xe0; +return xsave_test(); ; return 0; } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv_xsave_intrinsics__mxsave=yes + pgac_cv_xsave_intrinsics=yes else - pgac_cv_xsave_intrinsics__mxsave=no + pgac_cv_xsave_intrinsics=no fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext -CFLAGS="$pgac_save_CFLAGS" fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics__mxsave" >&5 -$as_echo "$pgac_cv_xsave_intrinsics__mxsave" >&6; } -if test x"$pgac_cv_xsave_intrinsics__mxsave" = x"yes"; then - CFLAGS_XSAVE="-mxsave" +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics" >&5 +$as_echo "$pgac_cv_xsave_intrinsics" >&6; } +if test x"$pgac_cv_xsave_intrinsics" = x"yes"; then pgac_xsave_intrinsics=yes fi -fi if test x"$pgac_xsave_intrinsics" = x"yes"; then $as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h fi - # Check for AVX-512 popcount intrinsics # -CFLAGS_POPCNT="" -PG_POPCNT_OBJS="" if test x"$host_cpu" = x"x86_64"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5 -$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; } -if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics+:} false; then : $as_echo_n "(cached) " >&6 else - pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS " -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -int -main () -{ -const char buf[sizeof(__m512i)]; - PG_INT64_TYPE popcnt = 0; - __m512i accum = _mm512_setzero_si512(); - const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf); - const __m512i cnt = _mm512_popcnt_epi64(val); - accum = _mm512_add_epi64(accum, cnt); - popcnt = _mm512_reduce_add_epi64(accum); - /* return computed value, to prevent the above being optimized away */ - return popcnt == 0; - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - pgac_cv_avx512_popcnt_intrinsics_=yes -else - pgac_cv_avx512_popcnt_intrinsics_=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -CFLAGS="$pgac_save_CFLAGS" -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5 -$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; } -if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then - CFLAGS_POPCNT="" - pgac_avx512_popcnt_intrinsics=yes -fi - - if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512bw" >&5 -$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512bw... " >&6; } -if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw+:} false; then : - $as_echo_n "(cached) " >&6 -else - pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512bw" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include + __attribute__((target("avx512vpopcntdq","avx512bw"))) + static int popcount_test(void) + { + const char buf[sizeof(__m512i)]; + PG_INT64_TYPE popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + return (int) popcnt; + } int main () { -const char buf[sizeof(__m512i)]; - PG_INT64_TYPE popcnt = 0; - __m512i accum = _mm512_setzero_si512(); - const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf); - const __m512i cnt = _mm512_popcnt_epi64(val); - accum = _mm512_add_epi64(accum, cnt); - popcnt = _mm512_reduce_add_epi64(accum); - /* return computed value, to prevent the above being optimized away */ - return popcnt == 0; +return popcount_test(); ; return 0; } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=yes + pgac_cv_avx512_popcnt_intrinsics=yes else - pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=no + pgac_cv_avx512_popcnt_intrinsics=no fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext -CFLAGS="$pgac_save_CFLAGS" fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&5 -$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&6; } -if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" = x"yes"; then - CFLAGS_POPCNT="-mavx512vpopcntdq -mavx512bw" +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics" = x"yes"; then pgac_avx512_popcnt_intrinsics=yes fi - fi if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then - PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o" $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi - - # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/configure.ac b/configure.ac index 55f6c46d33..8bdb9c576a 100644 --- a/configure.ac +++ b/configure.ac @@ -2052,32 +2052,19 @@ fi # Check for XSAVE intrinsics # -CFLAGS_XSAVE="" -PGAC_XSAVE_INTRINSICS([]) -if test x"$pgac_xsave_intrinsics" != x"yes"; then - PGAC_XSAVE_INTRINSICS([-mxsave]) -fi +PGAC_XSAVE_INTRINSICS() if test x"$pgac_xsave_intrinsics" = x"yes"; then AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.]) fi -AC_SUBST(CFLAGS_XSAVE) # Check for AVX-512 popcount intrinsics # -CFLAGS_POPCNT="" -PG_POPCNT_OBJS="" if test x"$host_cpu" = x"x86_64"; then - PGAC_AVX512_POPCNT_INTRINSICS([]) - if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then - PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512bw]) - fi + PGAC_AVX512_POPCNT_INTRINSICS() if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then - PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o" AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 popcount instructions with a runtime check.]) fi fi -AC_SUBST(CFLAGS_POPCNT) -AC_SUBST(PG_POPCNT_OBJS) # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # diff --git a/meson.build b/meson.build index 58e67975e8..9492f9d52e 100644 --- a/meson.build +++ b/meson.build @@ -2157,25 +2157,20 @@ endforeach # Check for the availability of XSAVE intrinsics. ############################################################### -cflags_xsave = [] if host_cpu == 'x86' or host_cpu == 'x86_64' prog = ''' #include +__attribute__((target("xsave"))) int main(void) { return _xgetbv(0) & 0xe0; } ''' - if cc.links(prog, name: 'XSAVE intrinsics without -mxsave', - args: test_c_args) + if cc.links(prog, name: 'XSAVE intrinsics', args: test_c_args) cdata.set('HAVE_XSAVE_INTRINSICS', 1) - elif cc.links(prog, name: 'XSAVE intrinsics with -mxsave', - args: test_c_args + ['-mxsave']) - cdata.set('HAVE_XSAVE_INTRINSICS', 1) - cflags_xsave += '-mxsave' endif endif @@ -2185,12 +2180,12 @@ endif # Check for the availability of AVX-512 popcount intrinsics. ############################################################### -cflags_popcnt = [] if host_cpu == 'x86_64' prog = ''' #include +__attribute__((target("avx512vpopcntdq","avx512bw"))) int main(void) { const char buf[sizeof(__m512i)]; @@ -2205,13 +2200,9 @@ int main(void) } ''' - if cc.links(prog, name: 'AVX-512 popcount without -mavx512vpopcntdq -mavx512bw', + if cc.links(prog, name: 'AVX-512 popcount', args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))]) cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) - elif cc.links(prog, name: 'AVX-512 popcount with -mavx512vpopcntdq -mavx512bw', - args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq'] + ['-mavx512bw']) - cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) - cflags_popcnt += ['-mavx512vpopcntdq'] + ['-mavx512bw'] endif endif diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 42f50b4976..45696247e9 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -262,9 +262,7 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@ CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@ -CFLAGS_POPCNT = @CFLAGS_POPCNT@ CFLAGS_CRC = @CFLAGS_CRC@ -CFLAGS_XSAVE = @CFLAGS_XSAVE@ PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@ PERMIT_MISSING_VARIABLE_DECLARATIONS = @PERMIT_MISSING_VARIABLE_DECLARATIONS@ CXXFLAGS = @CXXFLAGS@ @@ -762,9 +760,6 @@ LIBOBJS = @LIBOBJS@ # files needed for the chosen CRC-32C implementation PG_CRC32C_OBJS = @PG_CRC32C_OBJS@ -# files needed for the chosen popcount implementation -PG_POPCNT_OBJS = @PG_POPCNT_OBJS@ - LIBS := -lpgcommon -lpgport $(LIBS) # to make ws2_32.lib the last library diff --git a/src/include/c.h b/src/include/c.h index 55dec71a6d..6f5ca25542 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -174,6 +174,16 @@ #define pg_attribute_nonnull(...) #endif +/* + * pg_attribute_target allows specifying different target options that the + * function should be compiled with (e.g., for using special CPU instructions). + */ +#if __has_attribute (target) +#define pg_attribute_target(...) __attribute__((target(__VA_ARGS__))) +#else +#define pg_attribute_target(...) +#endif + /* * Append PG_USED_FOR_ASSERTS_ONLY to definitions of variables that are only * used in assert-enabled builds, to avoid compiler warnings about unused diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index 850e927584..479aa08420 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -102,10 +102,8 @@ pgxs_kv = { ' '.join(cflags_no_missing_var_decls), 'CFLAGS_CRC': ' '.join(cflags_crc), - 'CFLAGS_POPCNT': ' '.join(cflags_popcnt), 'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags), 'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags), - 'CFLAGS_XSAVE': ' '.join(cflags_xsave), 'LDFLAGS': var_ldflags, 'LDFLAGS_EX': var_ldflags_ex, @@ -181,7 +179,7 @@ pgxs_empty = [ 'WANTED_LANGUAGES', # Not needed because we don't build the server / PLs with the generated makefile - 'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS', + 'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS', 'DTRACEFLAGS', # only server has dtrace probes 'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 'perl_includespec', 'perl_privlibexp', diff --git a/src/port/Makefile b/src/port/Makefile index 9324ec2d9f..366c814bd9 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -38,13 +38,13 @@ LIBS += $(PTHREAD_LIBS) OBJS = \ $(LIBOBJS) \ $(PG_CRC32C_OBJS) \ - $(PG_POPCNT_OBJS) \ bsearch_arg.o \ chklocale.o \ inet_net_ntop.o \ noblock.o \ path.o \ pg_bitutils.o \ + pg_popcount_avx512.o \ pg_strong_random.o \ pgcheckdir.o \ pgmkdirp.o \ @@ -92,16 +92,6 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) -# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE -pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE) -pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE) -pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE) - -# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT -pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT) -pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT) -pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT) - # # Shared library versions of object files # diff --git a/src/port/meson.build b/src/port/meson.build index 1150966ab7..83a0632520 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_popcount_avx512.c', 'pg_strong_random.c', 'pgcheckdir.c', 'pgmkdirp.c', @@ -84,8 +85,6 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'], - ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], @@ -100,8 +99,8 @@ replace_funcs_pos = [ ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] -pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave} -pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []} +pgport_cflags = {'crc': cflags_crc} +pgport_sources_cflags = {'crc': []} foreach f : replace_funcs_neg func = f.get(0) diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c index 9d3149e2d0..b598e86554 100644 --- a/src/port/pg_popcount_avx512.c +++ b/src/port/pg_popcount_avx512.c @@ -12,7 +12,17 @@ */ #include "c.h" +#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) +#include +#endif + +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK #include +#endif + +#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) +#include +#endif #include "port/pg_bitutils.h" @@ -21,12 +31,82 @@ * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on * the function pointers that are only used when TRY_POPCNT_FAST is set. */ -#ifdef TRY_POPCNT_FAST +#if defined(TRY_POPCNT_FAST) && defined(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK) + +/* + * Does CPUID say there's support for XSAVE instructions? + */ +static inline bool +xsave_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#else +#error cpuid instruction not available +#endif + return (exx[2] & (1 << 27)) != 0; /* osxsave */ +} + +/* + * Does XGETBV say the ZMM registers are enabled? + * + * NB: Caller is responsible for verifying that xsave_available() returns true + * before calling this. + */ +#ifdef HAVE_XSAVE_INTRINSICS +pg_attribute_target("xsave") +#endif +static inline bool +zmm_regs_available(void) +{ +#ifdef HAVE_XSAVE_INTRINSICS + return (_xgetbv(0) & 0xe6) == 0xe6; +#else + return false; +#endif +} + +/* + * Does CPUID say there's support for AVX-512 popcount and byte-and-word + * instructions? + */ +static inline bool +avx512_popcnt_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#else +#error cpuid instruction not available +#endif + return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */ + (exx[1] & (1 << 30)) != 0; /* avx512-bw */ +} + +/* + * Returns true if the CPU supports the instructions required for the AVX-512 + * pg_popcount() implementation. + */ +bool +pg_popcount_avx512_available(void) +{ + return xsave_available() && + zmm_regs_available() && + avx512_popcnt_available(); +} /* * pg_popcount_avx512 * Returns the number of 1-bits in buf */ +pg_attribute_target("avx512vpopcntdq", "avx512bw") uint64 pg_popcount_avx512(const char *buf, int bytes) { @@ -82,6 +162,7 @@ pg_popcount_avx512(const char *buf, int bytes) * pg_popcount_masked_avx512 * Returns the number of 1-bits in buf after applying the mask to each byte */ +pg_attribute_target("avx512vpopcntdq", "avx512bw") uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) { @@ -138,4 +219,5 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) return _mm512_reduce_add_epi64(accum); } -#endif /* TRY_POPCNT_FAST */ +#endif /* TRY_POPCNT_FAST && + * USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c deleted file mode 100644 index b37107803a..0000000000 --- a/src/port/pg_popcount_avx512_choose.c +++ /dev/null @@ -1,102 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_popcount_avx512_choose.c - * Test whether we can use the AVX-512 pg_popcount() implementation. - * - * Copyright (c) 2024, PostgreSQL Global Development Group - * - * IDENTIFICATION - * src/port/pg_popcount_avx512_choose.c - * - *------------------------------------------------------------------------- - */ -#include "c.h" - -#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) -#include -#endif - -#ifdef HAVE_XSAVE_INTRINSICS -#include -#endif - -#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) -#include -#endif - -#include "port/pg_bitutils.h" - -/* - * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to - * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on - * the function pointers that are only used when TRY_POPCNT_FAST is set. - */ -#ifdef TRY_POPCNT_FAST - -/* - * Does CPUID say there's support for XSAVE instructions? - */ -static inline bool -xsave_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif - return (exx[2] & (1 << 27)) != 0; /* osxsave */ -} - -/* - * Does XGETBV say the ZMM registers are enabled? - * - * NB: Caller is responsible for verifying that xsave_available() returns true - * before calling this. - */ -static inline bool -zmm_regs_available(void) -{ -#ifdef HAVE_XSAVE_INTRINSICS - return (_xgetbv(0) & 0xe6) == 0xe6; -#else - return false; -#endif -} - -/* - * Does CPUID say there's support for AVX-512 popcount and byte-and-word - * instructions? - */ -static inline bool -avx512_popcnt_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); -#else -#error cpuid instruction not available -#endif - return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */ - (exx[1] & (1 << 30)) != 0; /* avx512-bw */ -} - -/* - * Returns true if the CPU supports the instructions required for the AVX-512 - * pg_popcount() implementation. - */ -bool -pg_popcount_avx512_available(void) -{ - return xsave_available() && - zmm_regs_available() && - avx512_popcnt_available(); -} - -#endif /* TRY_POPCNT_FAST */ -- 2.39.5 (Apple Git-154)