From b45f0c55d17cddfac7c99d11000b819c8b09fa56 Mon Sep 17 00:00:00 2001
From: bdrouvot <bdrouvot@gmail.com>
Date: Tue, 9 Jan 2024 05:08:35 +0000
Subject: [PATCH v7] Fix 035_standby_logical_decoding.pl race condition

We want to ensure that vacuum was able to remove dead rows (aka no other
transactions holding back global xmin) before testing for slots invalidation on
the standby.

While at it, also fixing some typos/bad test description in it.
---
 .../t/035_standby_logical_decoding.pl         | 68 ++++++++++---------
 1 file changed, 35 insertions(+), 33 deletions(-)
 100.0% src/test/recovery/t/

diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index 8bc39a5f03..94406ad960 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -238,6 +238,25 @@ sub check_for_invalidation
 	) or die "Timed out waiting confl_active_logicalslot to be updated";
 }
 
+# Launch $sql and wait for a new snapshot that has a newer horizon before
+# doing the vacuum with $vac_option on $to_vac.
+sub wait_until_vacuum_can_remove
+{
+	my ($vac_option, $sql, $to_vac) = @_;
+
+	# get the current xid horizon
+	my $xid_horizon = $node_primary->safe_psql('testdb', qq[select pg_snapshot_xmin(pg_current_snapshot());]);
+
+	# launch our sql
+	$node_primary->safe_psql('testdb', qq[$sql]);
+
+	$node_primary->poll_query_until('testdb',
+		"SELECT (select pg_snapshot_xmin(pg_current_snapshot())::text::int - $xid_horizon) > 0")
+	  or die "new snapshot does not have a newer horizon";
+
+	$node_primary->safe_psql('testdb', qq[VACUUM $vac_option verbose $to_vac;
+										  INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal;]);
+}
 ########################
 # Initialize primary node
 ########################
@@ -248,6 +267,7 @@ $node_primary->append_conf(
 wal_level = 'logical'
 max_replication_slots = 4
 max_wal_senders = 4
+autovacuum = off
 });
 $node_primary->dump_info;
 $node_primary->start;
@@ -468,13 +488,8 @@ reactive_slots_change_hfs_and_wait_for_xmins('behaves_ok_', 'vacuum_full_',
 	0, 1);
 
 # This should trigger the conflict
-$node_primary->safe_psql(
-	'testdb', qq[
-  CREATE TABLE conflict_test(x integer, y text);
-  DROP TABLE conflict_test;
-  VACUUM full pg_class;
-  INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+wait_until_vacuum_can_remove('full', 'CREATE TABLE conflict_test(x integer, y text);
+								 DROP TABLE conflict_test;', 'pg_class');
 
 $node_primary->wait_for_replay_catchup($node_standby);
 
@@ -550,13 +565,8 @@ reactive_slots_change_hfs_and_wait_for_xmins('vacuum_full_', 'row_removal_',
 	0, 1);
 
 # This should trigger the conflict
-$node_primary->safe_psql(
-	'testdb', qq[
-  CREATE TABLE conflict_test(x integer, y text);
-  DROP TABLE conflict_test;
-  VACUUM pg_class;
-  INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+wait_until_vacuum_can_remove('', 'CREATE TABLE conflict_test(x integer, y text);
+							 DROP TABLE conflict_test;', 'pg_class');
 
 $node_primary->wait_for_replay_catchup($node_standby);
 
@@ -582,19 +592,15 @@ check_pg_recvlogical_stderr($handle,
 # get the position to search from in the standby logfile
 $logstart = -s $node_standby->logfile;
 
-# One way to produce recovery conflict is to create/drop a relation and
-# launch a vacuum on pg_class with hot_standby_feedback turned off on the standby.
+# One way to produce recovery conflict on a shared catalog table is to create/drop
+# a role and launch a vacuum on pg_authid with hot_standby_feedback turned off on
+# the standby.
 reactive_slots_change_hfs_and_wait_for_xmins('row_removal_',
 	'shared_row_removal_', 0, 1);
 
 # Trigger the conflict
-$node_primary->safe_psql(
-	'testdb', qq[
-  CREATE ROLE create_trash;
-  DROP ROLE create_trash;
-  VACUUM pg_authid;
-  INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+wait_until_vacuum_can_remove('', 'CREATE ROLE create_trash;
+							 DROP ROLE create_trash;', 'pg_authid');
 
 $node_primary->wait_for_replay_catchup($node_standby);
 
@@ -625,14 +631,10 @@ reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_',
 	'no_conflict_', 0, 1);
 
 # This should not trigger a conflict
-$node_primary->safe_psql(
-	'testdb', qq[
-  CREATE TABLE conflict_test(x integer, y text);
-  INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
-  UPDATE conflict_test set x=1, y=1;
-  VACUUM conflict_test;
-  INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+wait_until_vacuum_can_remove('', 'CREATE TABLE conflict_test(x integer, y text);
+							 INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
+							 UPDATE conflict_test set x=1, y=1;', 'conflict_test');
+
 $node_primary->wait_for_replay_catchup($node_standby);
 
 # message should not be issued
@@ -671,7 +673,7 @@ $node_standby->restart;
 
 ##################################################
 # Recovery conflict: Invalidate conflicting slots, including in-use slots
-# Scenario 4: conflict due to on-access pruning.
+# Scenario 5: conflict due to on-access pruning.
 ##################################################
 
 # get the position to search from in the standby logfile
@@ -711,7 +713,7 @@ change_hot_standby_feedback_and_wait_for_xmins(1, 1);
 
 ##################################################
 # Recovery conflict: Invalidate conflicting slots, including in-use slots
-# Scenario 5: incorrect wal_level on primary.
+# Scenario 6: incorrect wal_level on primary.
 ##################################################
 
 # get the position to search from in the standby logfile
-- 
2.34.1

