From f08e15c0834616c636d1cb949ed140926265847e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 21 Nov 2024 12:42:09 -0800 Subject: [PATCH v10 3/4] Add AVX-512 CRC32C algorithm with a runtime check Adds pg_crc32c_avx512(): compute the crc32c of the buffer, where the buffer length must be at least 256, and a multiple of 64. Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009" Benchmark numbers to compare against the SSE4.2 CRC32C algorithm was generated by using the drive_crc32c() function added in src/test/modules/test_crc32c/test_crc32c.c. +------------------+----------------+----------------+------------------+-------+------+ | Rate in bytes/us | SDP (SPR) | m6i | m7i | | | +------------------+----------------+----------------+------------------+ Multi-| | | higher is better | SSE42 | AVX512 | SSE42 | AVX512 | SSE42 | AVX512 | plier | % | +==================+=================+=======+========+========+========+=======+======+ | AVG Rate 64-8192 | 10,095 | 82,101 | 8,591 | 38,652 | 11,867 | 83,194 | 6.68 | 568% | +------------------+--------+--------+-------+--------+--------+--------+-------+------+ | AVG Rate 64-255 | 9,034 | 9,136 | 7,619 | 7,437 | 9,030 | 9,293 | 1.01 | 1% | +------------------+--------+--------+-------+--------+--------+--------+-------+------+ Co-authored-by: Paul Amonson --- config/c-compiler.m4 | 32 +++++ configure | 154 ++++++++++++--------- configure.ac | 107 +++++++-------- meson.build | 23 ++++ src/include/pg_config.h.in | 3 + src/include/pg_cpu.h | 23 ++++ src/include/port/pg_crc32c.h | 55 +++----- src/include/port/pg_hw_feat_check.h | 6 + src/port/meson.build | 10 +- src/port/pg_crc32c_avx512.c | 203 ++++++++++++++++++++++++++++ src/port/pg_crc32c_sse42.c | 2 + src/port/pg_crc32c_sse42_choose.c | 51 ------- src/port/pg_crc32c_x86_choose.c | 57 ++++++++ src/port/pg_hw_feat_check.c | 75 +++++++++- 14 files changed, 578 insertions(+), 223 deletions(-) create mode 100644 src/include/pg_cpu.h create mode 100644 src/port/pg_crc32c_avx512.c delete mode 100644 src/port/pg_crc32c_sse42_choose.c create mode 100644 src/port/pg_crc32c_x86_choose.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index e112fd45d4..e08de01739 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -578,6 +578,38 @@ undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX512_CRC32_INTRINSICS +# --------------------------- +# Check if the compiler supports the x86 CRC instructions added in AVX-512, +# using intrinsics with function __attribute__((target("..."))): + +AC_DEFUN([PGAC_AVX512_CRC32_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_crc32_intrinsics])])dnl +AC_CACHE_CHECK([for _mm512_clmulepi64_epi128 with function attribute], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include + #include + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx512vl,vpclmulqdq"))) + #endif + static int crc32_avx512_test(void) + { + __m512i x0 = _mm512_set1_epi32(0x1); + __m512i x1 = _mm512_set1_epi32(0x2); + __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq + __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl + int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction + return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + }], + [return crc32_avx512_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx512_crc32_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_CRC32_INTRINSICS + + # PGAC_ARMV8_CRC32C_INTRINSICS # ---------------------------- # Check if the compiler supports the CRC32C instructions using the __crc32cb, diff --git a/configure b/configure index 518c33b73a..b03b928bfd 100755 --- a/configure +++ b/configure @@ -17159,7 +17159,7 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi -# Check for Intel SSE 4.2 intrinsics to do CRC calculations. +# Check for Intel SSE 4.2 and AVX-512 intrinsics to do CRC calculations. # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 $as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32... " >&6; } @@ -17203,6 +17203,52 @@ if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then fi +# Check if the _mm512_clmulepi64_epi128 and _mm_xor_epi64 can be used with with +# the __attribute__((target("avx512vl,vpclmulqdq"))). +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128 with function attribute" >&5 +$as_echo_n "checking for _mm512_clmulepi64_epi128 with function attribute... " >&6; } +if ${pgac_cv_avx512_crc32_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + #include + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx512vl,vpclmulqdq"))) + #endif + static int crc32_avx512_test(void) + { + __m512i x0 = _mm512_set1_epi32(0x1); + __m512i x1 = _mm512_set1_epi32(0x2); + __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq + __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl + int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction + return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + } +int +main () +{ +return crc32_avx512_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_crc32_intrinsics=yes +else + pgac_cv_avx512_crc32_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics" >&5 +$as_echo "$pgac_cv_avx512_crc32_intrinsics" >&6; } +if test x"$pgac_cv_avx512_crc32_intrinsics" = x"yes"; then + pgac_avx512_crc32_intrinsics=yes +fi + + # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -17404,9 +17450,8 @@ fi # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# the SSE/AVX-512 intrinsics compile both implementations and select which one +# to use at runtime, depending runtime cpuid information. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. And if @@ -17423,95 +17468,80 @@ fi # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then - # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then - USE_SSE42_CRC32C=1 - else - # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for - # the runtime check. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then - USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 - else - # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then - USE_ARMV8_CRC32C=1 - else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 - else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 - else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi - fi - fi - fi - fi -fi -# Set PG_CRC32C_OBJS appropriately depending on the selected implementation. { $as_echo "$as_me:${as_lineno-$LINENO}: checking which CRC-32C implementation to use" >&5 $as_echo_n "checking which CRC-32C implementation to use... " >&6; } -if test x"$USE_SSE42_CRC32C" = x"1"; then +if test x"$host_cpu" = x"x86_64"; then + #x86 only: + PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_x86_choose.o" + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 -$as_echo "SSE 4.2" >&6; } -else - if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C baseline feature SSE 4.2" >&5 +$as_echo "CRC32C baseline feature SSE 4.2" >&6; } + else + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 -$as_echo "SSE 4.2 with runtime check" >&6; } - else - if test x"$USE_ARMV8_CRC32C" = x"1"; then + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C SSE42 with runtime check" >&5 +$as_echo "CRC32C SSE42 with runtime check" >&6; } + fi + fi + if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + +$as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h + + PG_CRC32C_OBJS+=" pg_crc32c_avx512.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C AVX-512 with runtime check" >&5 +$as_echo "CRC32C AVX-512 with runtime check" >&6; } + fi +else + # non x86 code: + # Use ARM CRC Extension if available. + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 + PG_CRC32C_OBJS="pg_crc32c_armv8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 $as_echo "ARMv8 CRC instructions" >&6; } - else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + else + # ARM CRC Extension, with runtime check? + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } - else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + else + # LoongArch CRCC instructions. + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then $as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 $as_echo "LoongArch CRCC instructions" >&6; } - else + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } - fi fi fi fi fi - # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then if test x"$PREFERRED_SEMAPHORES" = x"NAMED_POSIX" ; then diff --git a/configure.ac b/configure.ac index 247ae97fa4..96a9c2db1f 100644 --- a/configure.ac +++ b/configure.ac @@ -2021,10 +2021,14 @@ if test x"$host_cpu" = x"x86_64"; then fi fi -# Check for Intel SSE 4.2 intrinsics to do CRC calculations. +# Check for Intel SSE 4.2 and AVX-512 intrinsics to do CRC calculations. # PGAC_SSE42_CRC32_INTRINSICS() +# Check if the _mm512_clmulepi64_epi128 and _mm_xor_epi64 can be used with with +# the __attribute__((target("avx512vl,vpclmulqdq"))). +PGAC_AVX512_CRC32_INTRINSICS([]) + # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [ @@ -2060,9 +2064,8 @@ AC_SUBST(CFLAGS_CRC) # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# the SSE/AVX-512 intrinsics compile both implementations and select which one +# to use at runtime, depending runtime cpuid information. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. And if @@ -2079,76 +2082,58 @@ AC_SUBST(CFLAGS_CRC) # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then - # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then - USE_SSE42_CRC32C=1 - else - # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for - # the runtime check. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then - USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 + +AC_MSG_CHECKING([which CRC-32C implementation to use]) +if test x"$host_cpu" = x"x86_64"; then + #x86 only: + PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_x86_choose.o" + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o" + AC_MSG_RESULT(CRC32C baseline feature SSE 4.2) else - # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then - USE_ARMV8_CRC32C=1 - else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 - else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 - else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o" + AC_MSG_RESULT(CRC32C SSE42 with runtime check) fi - fi fi - fi -fi - -# Set PG_CRC32C_OBJS appropriately depending on the selected implementation. -AC_MSG_CHECKING([which CRC-32C implementation to use]) -if test x"$USE_SSE42_CRC32C" = x"1"; then - AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o" - AC_MSG_RESULT(SSE 4.2) + if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel AVX 512 CRC instructions with a runtime check.]) + PG_CRC32C_OBJS+=" pg_crc32c_avx512.o" + AC_MSG_RESULT(CRC32C AVX-512 with runtime check) + fi else - if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" - AC_MSG_RESULT(SSE 4.2 with runtime check) + # non x86 code: + # Use ARM CRC Extension if available. + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then + AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) + PG_CRC32C_OBJS="pg_crc32c_armv8.o" + AC_MSG_RESULT(ARMv8 CRC instructions) else - if test x"$USE_ARMV8_CRC32C" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - AC_MSG_RESULT(ARMv8 CRC instructions) + # ARM CRC Extension, with runtime check? + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then + AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) + # LoongArch CRCC instructions. + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + AC_MSG_RESULT(LoongArch CRCC instructions) else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then - AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - AC_MSG_RESULT(LoongArch CRCC instructions) - else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) - fi + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) fi fi fi fi AC_SUBST(PG_CRC32C_OBJS) - # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then if test x"$PREFERRED_SEMAPHORES" = x"NAMED_POSIX" ; then diff --git a/meson.build b/meson.build index e5ce437a5c..5833661d71 100644 --- a/meson.build +++ b/meson.build @@ -2222,6 +2222,23 @@ if host_cpu == 'x86' or host_cpu == 'x86_64' have_optimized_crc = true else + avx512_crc_prog = ''' +#include +#include +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("avx512vl,vpclmulqdq"))) +#endif +int main(void) +{ + __m512i x0 = _mm512_set1_epi32(0x1); + __m512i x1 = _mm512_set1_epi32(0x2); + __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq + __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl + int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction + return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); +} +''' + prog = ''' #include @@ -2252,6 +2269,12 @@ int main(void) cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1) have_optimized_crc = true endif + if cc.links(avx512_crc_prog, + name: 'AVX512 CRC32C with function attributes', + args: test_c_args) + cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1) + have_optimized_crc = true + endif endif diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 07b2f798ab..db40e6476d 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -697,6 +697,9 @@ /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */ #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use Intel AVX-512 CRC instructions with a runtime check. */ +#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + /* Define to build with systemd support. (--with-systemd) */ #undef USE_SYSTEMD diff --git a/src/include/pg_cpu.h b/src/include/pg_cpu.h new file mode 100644 index 0000000000..223994cb0d --- /dev/null +++ b/src/include/pg_cpu.h @@ -0,0 +1,23 @@ +/* + * pg_cpu.h + * Useful macros to determine CPU types + */ + +#ifndef PG_CPU_H_ +#define PG_CPU_H_ +#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) + /* + * __i386__ is defined by gcc and Intel compiler on Linux, + * _M_IX86 by VS compiler, + * i386 by Sun compilers on opensolaris at least + */ + #define PG_CPU_X86 +#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) + /* + * both __x86_64__ and __amd64__ are defined by gcc + * __x86_64 defined by sun compiler on opensolaris at least + * _M_AMD64 defined by MS compiler + */ + #define PG_CPU_x86_64 +#endif +#endif // PG_CPU_H_ diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 63c8e3a00b..690273506b 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -34,58 +34,43 @@ #define PG_CRC32C_H #include "port/pg_bswap.h" +#include "pg_cpu.h" typedef uint32 pg_crc32c; /* The INIT and EQ macros are the same for all implementations. */ #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF) #define EQ_CRC32C(c1, c2) ((c1) == (c2)) - -#if defined(USE_SSE42_CRC32C) -/* Use Intel SSE4.2 instructions. */ -#define COMP_CRC32C(crc, data, len) \ - ((crc) = pg_comp_crc32c_sse42((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +/* x86 */ +#if defined(PG_CPU_X86) || defined(PG_CPU_x86_64) +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c((crc), (data), (len))) +/* ARMV8 */ #elif defined(USE_ARMV8_CRC32C) -/* Use ARMv8 CRC Extension instructions. */ - +extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); #define COMP_CRC32C(crc, data, len) \ ((crc) = pg_comp_crc32c_armv8((crc), (data), (len))) -#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +/* ARMV8 with runtime check */ +#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c((crc), (data), (len))) +/* LoongArch */ #elif defined(USE_LOONGARCH_CRC32C) -/* Use LoongArch CRCC instructions. */ - +extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); #define COMP_CRC32C(crc, data, len) \ ((crc) = pg_comp_crc32c_loongarch((crc), (data), (len))) -#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) - -extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); - -#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) - -/* - * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first - * to check that they are available. - */ -#define COMP_CRC32C(crc, data, len) \ - ((crc) = pg_comp_crc32c((crc), (data), (len))) -#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) - -extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); -extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); - -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK -extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); -#endif -#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK -extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); -#endif #else /* @@ -98,13 +83,11 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le #define COMP_CRC32C(crc, data, len) \ ((crc) = pg_comp_crc32c_sb8((crc), (data), (len))) #ifdef WORDS_BIGENDIAN +#undef FIN_CRC32C #define FIN_CRC32C(crc) ((crc) = pg_bswap32(crc) ^ 0xFFFFFFFF) -#else -#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) #endif extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); - #endif #endif /* PG_CRC32C_H */ diff --git a/src/include/port/pg_hw_feat_check.h b/src/include/port/pg_hw_feat_check.h index 58be900b54..3a73014987 100644 --- a/src/include/port/pg_hw_feat_check.h +++ b/src/include/port/pg_hw_feat_check.h @@ -30,4 +30,10 @@ extern PGDLLIMPORT bool pg_popcount_available(void); * available. */ extern PGDLLIMPORT bool pg_popcount_avx512_available(void); + +/* + * Test to see if all hardware features required by the AVX-512 SIMD + * algorithm are available. + */ +extern PGDLLIMPORT bool pg_crc32c_avx512_available(void); #endif /* PG_HW_FEAT_CHECK_H */ diff --git a/src/port/meson.build b/src/port/meson.build index ec28590473..0ba4a56194 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -8,8 +8,10 @@ pgport_sources = [ 'path.c', 'pg_bitutils.c', 'pg_popcount_avx512.c', - 'pg_crc32c_sse42_choose.c', + 'pg_crc32c_x86_choose.c', + 'pg_crc32c_avx512.c', 'pg_crc32c_sse42.c', + 'pg_crc32c_sb8.c', 'pg_hw_feat_check.c', 'pg_strong_random.c', 'pgcheckdir.c', @@ -83,12 +85,6 @@ endif # Replacement functionality to be built if corresponding configure symbol # is true replace_funcs_pos = [ - # x86/x64 - ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], - ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], diff --git a/src/port/pg_crc32c_avx512.c b/src/port/pg_crc32c_avx512.c new file mode 100644 index 0000000000..ba4defcefd --- /dev/null +++ b/src/port/pg_crc32c_avx512.c @@ -0,0 +1,203 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_avx512.c + * Compute CRC-32C checksum using Intel AVX-512 instructions. + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/port/pg_crc32c_avx512.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#if defined(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK) + +#include + +#include "port/pg_crc32c.h" + + +/******************************************************************* + * pg_crc32c_avx512(): compute the crc32c of the buffer, where the + * buffer length must be at least 256, and a multiple of 64. Based + * on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ + * Instruction" + * V. Gopal, E. Ozturk, et al., 2009 + * + * For This Function: + * Copyright 2015 The Chromium Authors + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google LLC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +pg_attribute_no_sanitize_alignment() +pg_attribute_target("avx512vl,vpclmulqdq") +inline pg_crc32c +pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length) +{ + static const uint64 k1k2[8] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, + 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + static const uint64 k3k4[8] = { + 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, + 0x9e4addf8, 0x740eef02, 0x9e4addf8}; + static const uint64 k9k10[8] = { + 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, + 0x0d3b6092, 0x6992cea2, 0x0d3b6092}; + static const uint64 k1k4[8] = { + 0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe, + 0x493c7d27, 0x00000000, 0x00000000}; + + const uint8 *input = (const uint8 *)data; + if (length >= 256) + { + uint64 val; + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a1, a2; + + /* + * AVX-512 Optimized crc32c algorithm with mimimum of 256 bytes aligned + * to 32 bytes. + * >>> BEGIN + */ + + /* + * There's at least one block of 256. + */ + x1 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + x2 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + x3 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + + x0 = _mm512_load_si512((__m512i *)k1k2); + + input += 256; + length -= 256; + + /* + * Parallel fold blocks of 256, if any. + */ + while (length >= 256) + { + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); + + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); + + y5 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + y6 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96); + x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96); + x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96); + x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96); + + input += 256; + length -= 256; + } + + /* + * Fold 256 bytes into 64 bytes. + */ + x0 = _mm512_load_si512((__m512i *)k9k10); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96); + + x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96); + + x0 = _mm512_load_si512((__m512i *)k3k4); + y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96); + + /* + * Single fold blocks of 64, if any. + */ + while (length >= 64) + { + x2 = _mm512_loadu_si512((__m512i *)input); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96); + + input += 64; + length -= 64; + } + + /* + * Fold 512-bits to 128-bits. + */ + x0 = _mm512_loadu_si512((__m512i *)k1k4); + + a2 = _mm512_extracti32x4_epi32(x1, 3); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96); + + x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E); + x0 = _mm512_xor_epi64(x1, x0); + a1 = _mm512_extracti32x4_epi32(x0, 1); + a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0)); + + /* + * Fold 128-bits to 32-bits. + */ + val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); + crc = (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + /* + * AVX-512 Optimized crc32c algorithm with mimimum of 256 bytes aligned + * to 32 bytes. + * <<< END + ******************************************************************/ + } + + /* + * Finish any remaining bytes with legacy AVX algorithm. + */ + return pg_comp_crc32c_sse42(crc, input, length); +} +#endif // AVX512_CRC32 diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index dcc4904a82..90d155e804 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -14,6 +14,7 @@ */ #include "c.h" +#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) #include #include "port/pg_crc32c.h" @@ -68,3 +69,4 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) return crc; } +#endif diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c deleted file mode 100644 index c659917af0..0000000000 --- a/src/port/pg_crc32c_sse42_choose.c +++ /dev/null @@ -1,51 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_crc32c_sse42_choose.c - * Choose between Intel SSE 4.2 and software CRC-32C implementation. - * - * On first call, checks if the CPU we're running on supports Intel SSE - * 4.2. If it does, use the special SSE instructions for CRC-32C - * computation. Otherwise, fall back to the pure software implementation - * (slicing-by-8). - * - * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/port/pg_crc32c_sse42_choose.c - * - *------------------------------------------------------------------------- - */ - -#include "c.h" - -#if defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) -#ifdef HAVE__GET_CPUID -#include -#endif - -#ifdef HAVE__CPUID -#include -#endif - -#include "port/pg_crc32c.h" -#include "port/pg_hw_feat_check.h" - -/* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. - */ -static pg_crc32c -pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) -{ - if (pg_crc32c_sse42_available()) - pg_comp_crc32c = pg_comp_crc32c_sse42; - else - pg_comp_crc32c = pg_comp_crc32c_sb8; - - return pg_comp_crc32c(crc, data, len); -} - -pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; -#endif diff --git a/src/port/pg_crc32c_x86_choose.c b/src/port/pg_crc32c_x86_choose.c new file mode 100644 index 0000000000..3ce8be11a6 --- /dev/null +++ b/src/port/pg_crc32c_x86_choose.c @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_x86_choose.c + * Choose between Intel AVX-512, SSE 4.2 and software CRC-32C implementation. + * + * On first call, checks if the CPU we're running on supports Intel AVX-512. If + * it does, use the special SSE instructions for CRC-32C computation. + * Otherwise, fall back to the pure software implementation (slicing-by-8). + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_crc32c_x86_choose.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" +#include "pg_cpu.h" + +#if defined(PG_CPU_X86) || defined(PG_CPU_x86_64) + +#include "port/pg_crc32c.h" +#include "port/pg_hw_feat_check.h" + +/* + * This gets called on the first call. It replaces the function pointer + * so that subsequent calls are routed directly to the chosen implementation. + * (1) set pg_comp_crc32c pointer and (2) return the computed crc value + */ +static pg_crc32c +pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +{ +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + if (pg_crc32c_avx512_available()) { + pg_comp_crc32c = pg_comp_crc32c_avx512; + return pg_comp_crc32c(crc, data, len); + } +#endif +#ifdef USE_SSE42_CRC32C + pg_comp_crc32c = pg_comp_crc32c_sse42; + return pg_comp_crc32c(crc, data, len); +#elif USE_SSE42_CRC32C_WITH_RUNTIME_CHECK + if (pg_crc32c_sse42_available()) { + pg_comp_crc32c = pg_comp_crc32c_sse42; + return pg_comp_crc32c(crc, data, len); + } +#endif + pg_comp_crc32c = pg_comp_crc32c_sb8; + return pg_comp_crc32c(crc, data, len); +} + +pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; + +#endif // x86/x86_64 diff --git a/src/port/pg_hw_feat_check.c b/src/port/pg_hw_feat_check.c index 260aa60502..b2872fa708 100644 --- a/src/port/pg_hw_feat_check.c +++ b/src/port/pg_hw_feat_check.c @@ -11,6 +11,9 @@ *------------------------------------------------------------------------- */ #include "c.h" +#include "pg_cpu.h" + +#if defined(PG_CPU_X86) || defined(PG_CPU_x86_64) #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) #include @@ -135,9 +138,60 @@ bool PGDLLIMPORT pg_popcount_available(void) return is_bit_set_in_exx(exx, ECX, 23); } +/* + * Check for CPU supprt for CPUIDEX: avx512-f + */ +inline static bool +avx512f_available(void) +{ + exx_t exx[4] = {0, 0, 0, 0}; + + pg_getcpuidex(7, 0, exx); + return is_bit_set_in_exx(exx, EBX, 16); /* avx512-f */ +} + +/* + * Check for CPU supprt for CPUIDEX: vpclmulqdq + */ +inline static bool +vpclmulqdq_available(void) +{ + exx_t exx[4] = {0, 0, 0, 0}; + + pg_getcpuidex(7, 0, exx); + return is_bit_set_in_exx(exx, ECX, 10); /* vpclmulqdq */ +} + +/* + * Check for CPU supprt for CPUIDEX: vpclmulqdq + */ +inline static bool +avx512vl_available(void) +{ + exx_t exx[4] = {0, 0, 0, 0}; + + pg_getcpuidex(7, 0, exx); + return is_bit_set_in_exx(exx, EBX, 31); /* avx512-vl */ +} + +/* + * Check for CPU supprt for CPUID: sse4.2 + */ +inline static bool +sse42_available(void) +{ + exx_t exx[4] = {0, 0, 0, 0}; + + pg_getcpuid(1, exx); + return is_bit_set_in_exx(exx, ECX, 20); /* sse4.2 */ +} + +/****************************************************************************/ +/* Public API */ +/****************************************************************************/ /* - * Returns true if the CPU supports the instructions required for the AVX-512 - * pg_popcount() implementation. + * Returns true if the CPU supports the instructions required for the + * AVX-512 pg_popcount() implementation. * * PA: The call to 'osxsave_available' MUST preceed the call to * 'zmm_regs_available' function per NB above. @@ -154,10 +208,19 @@ bool PGDLLIMPORT pg_popcount_avx512_available(void) */ bool PGDLLIMPORT pg_crc32c_sse42_available(void) { - exx_t exx[4] = {0, 0, 0, 0}; - - pg_getcpuid(1, exx); + return sse42_available(); +} - return is_bit_set_in_exx(exx, ECX, 20); +/* + * Returns true if the CPU supports the instructions required for the AVX-512 + * pg_crc32c implementation. + */ +bool PGDLLIMPORT +pg_crc32c_avx512_available(void) +{ + return sse42_available() && osxsave_available() && + avx512f_available() && vpclmulqdq_available() && + avx512vl_available() && zmm_regs_available(); } +#endif // #if defined(PG_CPU_X86) || defined(PG_CPU_x86_64) -- 2.34.1