From c405f5a41cca9ce5de9c0e69e00d11b16ba93986 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Tue, 26 Nov 2024 14:17:03 -0600 Subject: [PATCH v6 1/1] Use __attribute__((target(...))) for SSE4.2 support. Author: Raghuveer Devulapalli Discussion: https://postgr.es/m/PH8PR11MB8286BE735A463468415D46B5FB5C2%40PH8PR11MB8286.namprd11.prod.outlook.com --- config/c-compiler.m4 | 32 +++++++------ configure | 93 ++++++++++++-------------------------- configure.ac | 19 ++++---- meson.build | 22 ++++++--- src/port/Makefile | 5 -- src/port/meson.build | 2 +- src/port/pg_crc32c_sse42.c | 1 + 7 files changed, 72 insertions(+), 102 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index a129edb88e..309d5b04b4 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -605,24 +605,26 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS # test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if # the other ones are, on x86-64 platforms) # -# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the -# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC. +# If the intrinsics are supported, sets pgac_sse42_crc32_intrinsics. AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS], -[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl -AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar], -[pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS $1" -AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [unsigned int crc = 0; - crc = _mm_crc32_u8(crc, 0); - crc = _mm_crc32_u32(crc, 0); - /* return computed value, to prevent the above being optimized away */ - return crc == 0;])], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl +AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("sse4.2"))) + #endif + static int crc32_sse42_test(void) + { + unsigned int crc = 0; + crc = _mm_crc32_u8(crc, 0); + crc = _mm_crc32_u32(crc, 0); + /* return computed value, to prevent the above being optimized away */ + return crc == 0; + }], + [return crc32_sse42_test();])], [Ac_cachevar=yes], - [Ac_cachevar=no]) -CFLAGS="$pgac_save_CFLAGS"]) + [Ac_cachevar=no])]) if test x"$Ac_cachevar" = x"yes"; then - CFLAGS_CRC="$1" pgac_sse42_crc32_intrinsics=yes fi undefine([Ac_cachevar])dnl diff --git a/configure b/configure index 199d666aa7..cb89866649 100755 --- a/configure +++ b/configure @@ -17375,87 +17375,47 @@ fi # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # -# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used -# with the default compiler flags. If not, check if adding the -msse4.2 -# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5 -$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; } -if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then : +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 +$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32... " >&6; } +if ${pgac_cv_sse42_crc32_intrinsics+:} false; then : $as_echo_n "(cached) " >&6 else - pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS " -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -int -main () -{ -unsigned int crc = 0; - crc = _mm_crc32_u8(crc, 0); - crc = _mm_crc32_u32(crc, 0); - /* return computed value, to prevent the above being optimized away */ - return crc == 0; - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - pgac_cv_sse42_crc32_intrinsics_=yes -else - pgac_cv_sse42_crc32_intrinsics_=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -CFLAGS="$pgac_save_CFLAGS" -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5 -$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; } -if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then - CFLAGS_CRC="" - pgac_sse42_crc32_intrinsics=yes -fi - -if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5 -$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; } -if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then : - $as_echo_n "(cached) " >&6 -else - pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS -msse4.2" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("sse4.2"))) + #endif + static int crc32_sse42_test(void) + { + unsigned int crc = 0; + crc = _mm_crc32_u8(crc, 0); + crc = _mm_crc32_u32(crc, 0); + /* return computed value, to prevent the above being optimized away */ + return crc == 0; + } int main () { -unsigned int crc = 0; - crc = _mm_crc32_u8(crc, 0); - crc = _mm_crc32_u32(crc, 0); - /* return computed value, to prevent the above being optimized away */ - return crc == 0; +return crc32_sse42_test(); ; return 0; } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv_sse42_crc32_intrinsics__msse4_2=yes + pgac_cv_sse42_crc32_intrinsics=yes else - pgac_cv_sse42_crc32_intrinsics__msse4_2=no + pgac_cv_sse42_crc32_intrinsics=no fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext -CFLAGS="$pgac_save_CFLAGS" fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5 -$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; } -if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then - CFLAGS_CRC="-msse4.2" +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics" >&5 +$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; } +if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then pgac_sse42_crc32_intrinsics=yes fi -fi # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. @@ -17658,15 +17618,20 @@ fi # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, perhaps with some extra CFLAGS, compile both -# implementations and select which one to use at runtime, depending on whether -# SSE 4.2 is supported by the processor we're running on. +# the SSE intrinsics, compile both implementations and select which one to use +# at runtime, depending on whether SSE 4.2 is supported by the processor we're +# running on. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. And if # we're not targeting such a processor, but can nevertheless produce code that # uses the CRC instructions, compile both, and select at runtime. # +# Note that we do not use __attribute__((target("..."))) for the ARM CRC +# instructions because until Clang 16.0.0, using the intrinsics still requires +# special -march flags. Perhaps we can re-evaluate this decision after some +# time has passed. +# # You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. # diff --git a/configure.ac b/configure.ac index 4f56bb5062..5785600faf 100644 --- a/configure.ac +++ b/configure.ac @@ -2069,13 +2069,7 @@ fi # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # -# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used -# with the default compiler flags. If not, check if adding the -msse4.2 -# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required. -PGAC_SSE42_CRC32_INTRINSICS([]) -if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then - PGAC_SSE42_CRC32_INTRINSICS([-msse4.2]) -fi +PGAC_SSE42_CRC32_INTRINSICS() # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. @@ -2112,15 +2106,20 @@ AC_SUBST(CFLAGS_CRC) # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, perhaps with some extra CFLAGS, compile both -# implementations and select which one to use at runtime, depending on whether -# SSE 4.2 is supported by the processor we're running on. +# the SSE intrinsics, compile both implementations and select which one to use +# at runtime, depending on whether SSE 4.2 is supported by the processor we're +# running on. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. And if # we're not targeting such a processor, but can nevertheless produce code that # uses the CRC instructions, compile both, and select at runtime. # +# Note that we do not use __attribute__((target("..."))) for the ARM CRC +# instructions because until Clang 16.0.0, using the intrinsics still requires +# special -march flags. Perhaps we can re-evaluate this decision after some +# time has passed. +# # You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. # diff --git a/meson.build b/meson.build index 83e61d0f4a..d6dc8c8525 100644 --- a/meson.build +++ b/meson.build @@ -2211,14 +2211,19 @@ endif # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, perhaps with some extra CFLAGS, compile both -# implementations and select which one to use at runtime, depending on whether -# SSE 4.2 is supported by the processor we're running on. +# the SSE intrinsics, compile both implementations and select which one to use +# at runtime, depending on whether SSE 4.2 is supported by the processor we're +# running on. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. And if # we're not targeting such a processor, but can nevertheless produce code that # uses the CRC instructions, compile both, and select at runtime. +# +# Note that we do not use __attribute__((target("..."))) for the ARM CRC +# instructions because until Clang 16.0.0, using the intrinsics still requires +# special -march flags. Perhaps we can re-evaluate this decision after some +# time has passed. ############################################################### have_optimized_crc = false @@ -2234,6 +2239,9 @@ if host_cpu == 'x86' or host_cpu == 'x86_64' prog = ''' #include +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("sse4.2"))) +#endif int main(void) { unsigned int crc = 0; @@ -2244,16 +2252,16 @@ int main(void) } ''' - if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2', + if not cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32', args: test_c_args) + # Do not use Intel SSE 4.2 + elif (cc.get_define('__SSE4_2__') != '') # Use Intel SSE 4.2 unconditionally. cdata.set('USE_SSE42_CRC32C', 1) have_optimized_crc = true - elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2', - args: test_c_args + ['-msse4.2']) + else # Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for # the runtime check. - cflags_crc += '-msse4.2' cdata.set('USE_SSE42_CRC32C', false) cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1) have_optimized_crc = true diff --git a/src/port/Makefile b/src/port/Makefile index 366c814bd9..4c22431951 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -82,11 +82,6 @@ libpgport.a: $(OBJS) rm -f $@ $(AR) $(AROPT) $@ $^ -# all versions of pg_crc32c_sse42.o need CFLAGS_CRC -pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC) -pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC) -pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC) - # all versions of pg_crc32c_armv8.o need CFLAGS_CRC pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) diff --git a/src/port/meson.build b/src/port/meson.build index 83a0632520..c5bceed9cd 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -82,7 +82,7 @@ endif replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], - ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'], + ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 7f88c11480..dcc4904a82 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -19,6 +19,7 @@ #include "port/pg_crc32c.h" pg_attribute_no_sanitize_alignment() +pg_attribute_target("sse4.2") pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) { -- 2.39.5 (Apple Git-154)