From 45751bcfc2738725801a4ebf27b1b5abf4883ab5 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Thu, 15 Jul 2021 15:03:31 -0400
Subject: [PATCH v5] Advance old-segment horizon properly after slot
 invalidation

When some slots are invalidated due to the max_slot_wal_keep_size limit,
the old segment horizon should move forward to stay within the limit.
However, in commit c6550776394e we forgot to call KeepLogSeg again to
recompute the horizon after invalidating replication slots.  In cases
where other slots remained, the limits would be recomputed eventually
for other reasons, but if all slots were invalidated, the limits would
not move at all afterwards.  Repair.

Backpatch to 13 where the feature was introduced.

Author: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Reported-by: Marcin Krupowicz <mk@071.ovh>
Discussion: https://postgr.es/m/17103-004130e8f27782c9@postgresql.org
---
 src/backend/replication/slot.c            | 20 ++++++++++++++++++--
 src/test/recovery/t/019_replslot_limit.pl | 15 ++++++++++++++-
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 33b85d86cc..62d52bbce1 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1143,11 +1143,14 @@ ReplicationSlotReserveWal(void)
  * Returns whether ReplicationSlotControlLock was released in the interim (and
  * in that case we're not holding the lock at return, otherwise we are).
  *
+ * Sets *invalidated true if the slot was invalidated. (Untouched otherwise.)
+ *
  * This is inherently racy, because we release the LWLock
  * for syscalls, so caller must restart if we return true.
  */
 static bool
-InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
+InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN,
+							   bool *invalidated)
 {
 	int			last_signaled_pid = 0;
 	bool		released_lock = false;
@@ -1204,6 +1207,9 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
 			s->active_pid = MyProcPid;
 			s->data.invalidated_at = restart_lsn;
 			s->data.restart_lsn = InvalidXLogRecPtr;
+
+			/* Let caller know */
+			*invalidated = true;
 		}
 
 		SpinLockRelease(&s->mutex);
@@ -1297,6 +1303,7 @@ void
 InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
 {
 	XLogRecPtr	oldestLSN;
+	bool		invalidated = false;
 
 	XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
 
@@ -1309,13 +1316,22 @@ restart:
 		if (!s->in_use)
 			continue;
 
-		if (InvalidatePossiblyObsoleteSlot(s, oldestLSN))
+		if (InvalidatePossiblyObsoleteSlot(s, oldestLSN, &invalidated))
 		{
 			/* if the lock was released, start from scratch */
 			goto restart;
 		}
 	}
 	LWLockRelease(ReplicationSlotControlLock);
+
+	/*
+	 * If any slots have been invalidated, recalculate the resource limits.
+	 */
+	if (invalidated)
+	{
+		ReplicationSlotsComputeRequiredXmin(false);
+		ReplicationSlotsComputeRequiredLSN();
+	}
 }
 
 /*
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
index d4b9ff705f..72cb436791 100644
--- a/src/test/recovery/t/019_replslot_limit.pl
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -11,7 +11,7 @@ use TestLib;
 use PostgresNode;
 
 use File::Path qw(rmtree);
-use Test::More tests => $TestLib::windows_os ? 14 : 18;
+use Test::More tests => $TestLib::windows_os ? 15 : 19;
 use Time::HiRes qw(usleep);
 
 $ENV{PGDATABASE} = 'postgres';
@@ -192,6 +192,19 @@ $result = $node_primary->safe_psql('postgres',
 is($result, "rep1|f|t|lost|",
 	'check that the slot became inactive and the state "lost" persists');
 
+# The invalidated slot shouldn't keep the old-segment horizon back;
+# see bug #17103: https://postgr.es/m/17103-004130e8f27782c9@postgresql.org
+# Test for this by creating a new slot and comparing its restart LSN
+# to the oldest existing file.
+my $redoseg = $node_primary->safe_psql('postgres',
+	"SELECT pg_walfile_name(lsn) FROM pg_create_physical_replication_slot('xx', true)"
+);
+my $oldestseg = $node_primary->safe_psql('postgres',
+	"SELECT pg_ls_dir AS f FROM pg_ls_dir('pg_wal') WHERE pg_ls_dir ~ '^[0-9A-F]{24}\$' ORDER BY 1 LIMIT 1"
+);
+
+is($oldestseg, $redoseg, "check that segments have been removed");
+
 # The standby no longer can connect to the primary
 $logstart = get_log_size($node_standby);
 $node_standby->start;
-- 
2.20.1

