From 68ee8bf34c80a0a3df02c2aae8357f664895b4de Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Mon, 18 Mar 2024 10:55:50 -0500
Subject: [PATCH v4 2/3] pg_lfind32(): Further optimize processing remaining
 elements.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/pg_lfind.h | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index bef0e2d5be..83fb8f50d2 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -96,8 +96,8 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	/*
 	 * For better instruction-level parallelism, each loop iteration operates
 	 * on a block of registers.  We first do as much processing as possible
-	 * with a block of 4 registers, then we try to process what remains with a
-	 * block of 2 registers.
+	 * with a block of 4 registers, then we process what remains with a block
+	 * of 2 registers.
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
@@ -120,6 +120,15 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
+	/*
+	 * If there aren't enough elements for the SIMD optimizations, jump
+	 * straight to the standard one-by-one linear search code below.
+	 * Testing has shown that the gains of skipping ahead are worth the
+	 * cost of the extra check.
+	 */
+	if (nelem < nelem_per_vector * 2)
+		goto slow_path;
+
 	for (i = 0; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -165,6 +174,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	nelem_per_iteration = 2 * nelem_per_vector;
 	tail_idx = nelem & ~(nelem_per_iteration - 1);
 
+retry:
 	for (; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -191,8 +201,25 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	/*
+	 * Process the remaining elements by rewinding i so that a final pass
+	 * of the 2-register loop above ends exactly at nelem.  Some elements
+	 * will be processed more than once, but that won't affect correctness,
+	 * and testing shows that this approach helps more than it harms.
+	 */
+	if (i != nelem)
+	{
+		tail_idx = nelem;
+		i = tail_idx - nelem_per_iteration;
+		goto retry;
+	}
+
+	Assert(!assert_result);
+	return false;
 #endif							/* ! USE_NO_SIMD */
 
+slow_path:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
-- 
2.25.1
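For reference, a minimal standalone sketch of the overlapping-tail technique
the patch applies follows.  It substitutes a hypothetical scalar
block_contains() helper and a fixed BLOCK_SIZE for the real Vector32
intrinsics and register blocking, so those names are illustrative assumptions
rather than anything in pg_lfind.h; only the control flow (the small-array
bail-out and the rewound final iteration) mirrors the patch.

    #include <stdbool.h>
    #include <stdint.h>

    #define BLOCK_SIZE 4        /* stand-in for one 2-register block */

    /*
     * Hypothetical scalar stand-in for a vectorized compare of BLOCK_SIZE
     * elements; the real code uses Vector32 intrinsics instead.
     */
    static bool
    block_contains(uint32_t key, const uint32_t *base)
    {
        for (int j = 0; j < BLOCK_SIZE; j++)
        {
            if (base[j] == key)
                return true;
        }
        return false;
    }

    static bool
    lfind32_sketch(uint32_t key, const uint32_t *base, uint32_t nelem)
    {
        uint32_t    i = 0;
        uint32_t    tail_idx;

        /* Too small for block processing: use the one-by-one loop. */
        if (nelem < BLOCK_SIZE)
            goto slow_path;

        /* Process as many full blocks as fit below nelem. */
        tail_idx = nelem & ~(BLOCK_SIZE - 1);
    retry:
        for (; i < tail_idx; i += BLOCK_SIZE)
        {
            if (block_contains(key, &base[i]))
                return true;
        }

        /*
         * Overlapping tail: rewind i so one more pass of the loop above
         * ends exactly at nelem.  Elements in [i, tail_idx) are checked
         * twice, which is harmless for a membership test.
         */
        if (i != nelem)
        {
            tail_idx = nelem;
            i = tail_idx - BLOCK_SIZE;
            goto retry;
        }
        return false;

    slow_path:
        /* Standard one-at-a-time search for arrays smaller than a block. */
        for (; i < nelem; i++)
        {
            if (base[i] == key)
                return true;
        }
        return false;
    }

Rewinding i to tail_idx - BLOCK_SIZE makes the final block end exactly at
nelem, so the scalar remainder loop only runs when the array was too small
for block processing in the first place.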

