From 67dbf92aceaa506746c66276354ab430a85f06c7 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 12 Apr 2024 12:10:20 -0700
Subject: [PATCH v1 2/2] WIP: Avoid spuriously triggering the stuck spinlock
 detection

When signals are delivered, the sleep in perform_spin_delay() is
interrupted. Until now, perform_spin_delay() did not take that into account,
and behaved as if the sleep was uninterrupted. In some workloads signals can
be frequent enough to trigger the old logic almost immediately.

As this needs to be fixed in the backbranches, we are somewhat constrained by
ABI compatibility. Otherwise we e.g. could track the actual elapsed time in
SpinDelayStatus.

Instead, check if errno == EINTR after pg_usleep(), and only increase
SpinDelayStatus->delays if the sleep was not interrupted. Both the normal and
the windows implementation of pg_usleep() set errno == EINTR when interrupted.

This can substantially increase the amount of time until a stuck spinlock is
detected. While not great, it's much better than spuriously crash-restarting.
---
 src/port/pgsleep.c                |  3 +++
 src/backend/storage/lmgr/s_lock.c | 12 +++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/port/pgsleep.c b/src/port/pgsleep.c
index 1284458bfce..1f87ef03490 100644
--- a/src/port/pgsleep.c
+++ b/src/port/pgsleep.c
@@ -36,6 +36,8 @@
  * might happen to run before the sleep begins, allowing the full delay.
  * Better practice is to use WaitLatch() with a timeout, so that backends
  * respond to latches and signals promptly.
+ *
+ * Some callers rely on errno being set to EINTR when interrupted.
  */
 void
 pg_usleep(long microsec)
@@ -49,6 +51,7 @@ pg_usleep(long microsec)
 		delay.tv_nsec = (microsec % 1000000L) * 1000;
 		(void) nanosleep(&delay, NULL);
 #else
+		/* will not be interrupted due to alertable = FALSE */
 		SleepEx((microsec < 500 ? 1 : (microsec + 500) / 1000), FALSE);
 #endif
 	}
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index f725d026317..e173131e5b4 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -137,7 +137,7 @@ perform_spin_delay(SpinDelayStatus *status)
 	/* Block the process every spins_per_delay tries */
 	if (++(status->spins) >= spins_per_delay)
 	{
-		if (++(status->delays) > NUM_DELAYS)
+		if (status->delays > NUM_DELAYS)
 			s_lock_stuck(status->file, status->line, status->func);
 
 		if (status->cur_delay == 0) /* first time to delay? */
@@ -152,9 +152,19 @@ perform_spin_delay(SpinDelayStatus *status)
 		 * this is better than nothing.
 		 */
 		pgstat_report_wait_start(WAIT_EVENT_SPIN_DELAY);
+		errno = 0;				/* see below */
 		pg_usleep(status->cur_delay);
 		pgstat_report_wait_end();
 
+		/*
+		 * Only count the sleep above as a delay, if not interrupted.
+		 * Otherwise a process getting signaled rapidly can trigger the stuck
+		 * spinlock logic in a fraction of the expected time. This isn't a
+		 * great fix for that problem, but doesn't require an API / ABI break.
+		 */
+		if (errno != EINTR)
+			status->delays++;
+
 #if defined(S_LOCK_TEST)
 		fprintf(stdout, "*");
 		fflush(stdout);
-- 
2.44.0.279.g3bd955d269