From a5c6ef80cae1fb7606ff46679422238fdceb5cc8 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot Date: Tue, 30 May 2023 07:54:02 +0000 Subject: [PATCH v1] Ensure vacuum did remove dead rows in 035_standby_logical_decoding We want to ensure that vacuum was able to remove dead rows (aka no other transactions holding back global xmin) before testing for slots invalidation on the standby. --- .../t/035_standby_logical_decoding.pl | 263 ++++++++++-------- 1 file changed, 150 insertions(+), 113 deletions(-) 100.0% src/test/recovery/t/ diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index 64beec4bd3..d26d9bcace 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -9,6 +9,7 @@ use warnings; use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; use Test::More; +use Time::HiRes qw(usleep); my ($stdin, $stdout, $stderr, $cascading_stdout, $cascading_stderr, $subscriber_stdin, @@ -23,6 +24,9 @@ my $node_subscriber = PostgreSQL::Test::Cluster->new('subscriber'); my $default_timeout = $PostgreSQL::Test::Utils::timeout_default; my $psql_timeout = IPC::Run::timer($default_timeout); my $res; +my ($vacret, $vacstdout, $vacstderr) = ('', '', ''); +my $nb_vac = 0; +my $vac_attempts = 10; # Name for the physical slot on primary my $primary_slotname = 'primary_physical'; @@ -185,7 +189,7 @@ sub change_hot_standby_feedback_and_wait_for_xmins # Check conflicting status in pg_replication_slots. sub check_slots_conflicting_status { - my ($conflicting) = @_; + my ($conflicting, $details) = @_; if ($conflicting) { @@ -193,7 +197,7 @@ sub check_slots_conflicting_status 'postgres', qq( select bool_and(conflicting) from pg_replication_slots;)); - is($res, 't', "Logical slots are reported as conflicting"); + is($res, 't', "logical slots are reported as conflicting $details"); } else { @@ -201,7 +205,7 @@ sub check_slots_conflicting_status 'postgres', qq( select bool_or(conflicting) from pg_replication_slots;)); - is($res, 'f', "Logical slots are reported as non conflicting"); + is($res, 'f', "logical slots are reported as non conflicting $details"); } } @@ -256,6 +260,29 @@ sub check_for_invalidation ) or die "Timed out waiting confl_active_logicalslot to be updated"; } +# Try ($vac_attempts times max) to vacuum until it is able to remove dead rows. +sub vacuum_until_can_remove +{ + my ($vac_option, $sql, $to_vac) = @_; + + ($vacret, $vacstdout, $vacstderr) = ('', '', ''); + $nb_vac = 0; + + while ($nb_vac <= $vac_attempts) + { + last if ($vacstderr =~ /0 dead row versions cannot be removed yet/ || + $vacstderr =~ /0 are dead but not yet removable/); + # This should trigger the conflict + ($vacret, $vacstdout, $vacstderr) = $node_primary->psql( + 'testdb', qq[ + $sql + VACUUM $vac_option verbose $to_vac; + INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal + ]); + $nb_vac++; + usleep(100_000); + } +} ######################## # Initialize primary node ######################## @@ -266,6 +293,7 @@ $node_primary->append_conf( wal_level = 'logical' max_replication_slots = 4 max_wal_senders = 4 +autovacuum = off }); $node_primary->dump_info; $node_primary->start; @@ -485,30 +513,31 @@ $node_subscriber->stop; reactive_slots_change_hfs_and_wait_for_xmins('behaves_ok_', 'vacuum_full_', 0, 1); -# This should trigger the conflict -$node_primary->safe_psql( - 'testdb', qq[ - CREATE TABLE conflict_test(x integer, y text); - DROP TABLE conflict_test; - VACUUM full pg_class; - INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal -]); +vacuum_until_can_remove('full', 'CREATE TABLE conflict_test(x integer, y text); + DROP TABLE conflict_test;', 'pg_class'); $node_primary->wait_for_replay_catchup($node_standby); -# Check invalidation in the logfile and in pg_stat_database_conflicts -check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class'); +SKIP: +{ + # Checking for invalidation only if vacuum did remove dead rows + skip 'vacuum full did not remove dead rows', 1 + unless ($nb_vac < $vac_attempts); -# Verify slots are reported as conflicting in pg_replication_slots -check_slots_conflicting_status(1); + # Check invalidation in the logfile and in pg_stat_database_conflicts + check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class'); -$handle = - make_slot_active($node_standby, 'vacuum_full_', 0, \$stdout, \$stderr); + # Verify slots are reported as conflicting in pg_replication_slots + check_slots_conflicting_status(1, 'after vacuum FULL on pg_class'); -# We are not able to read from the slot as it has been invalidated -check_pg_recvlogical_stderr($handle, - "can no longer get changes from replication slot \"vacuum_full_activeslot\"" -); + $handle = + make_slot_active($node_standby, 'vacuum_full_', 0, \$stdout, \$stderr); + + # We are not able to read from the slot as it has been invalidated + check_pg_recvlogical_stderr($handle, + "can no longer get changes from replication slot \"vacuum_full_activeslot\"" + ); +} # Turn hot_standby_feedback back on change_hot_standby_feedback_and_wait_for_xmins(1, 1); @@ -518,45 +547,49 @@ change_hot_standby_feedback_and_wait_for_xmins(1, 1); ################################################## $node_standby->restart; -# Verify slots are reported as conflicting in pg_replication_slots -check_slots_conflicting_status(1); +SKIP: +{ + skip 'vacuum full on pg_class did not remove dead rows', 1 + unless ($nb_vac < $vac_attempts); + # Verify slots are reported as conflicting in pg_replication_slots + check_slots_conflicting_status(1, 'after a restart'); ################################################## # Verify that invalidated logical slots do not lead to retaining WAL. ################################################## -# Get the restart_lsn from an invalidated slot -my $restart_lsn = $node_standby->safe_psql('postgres', - "SELECT restart_lsn from pg_replication_slots WHERE slot_name = 'vacuum_full_activeslot' and conflicting is true;" -); - -chomp($restart_lsn); + # Get the restart_lsn from an invalidated slot + my $restart_lsn = $node_standby->safe_psql('postgres', + "SELECT restart_lsn from pg_replication_slots WHERE slot_name = 'vacuum_full_activeslot' and conflicting is true;" + ); -# As pg_walfile_name() can not be executed on the standby, -# get the WAL file name associated to this lsn from the primary -my $walfile_name = $node_primary->safe_psql('postgres', - "SELECT pg_walfile_name('$restart_lsn')"); + chomp($restart_lsn); -chomp($walfile_name); + # As pg_walfile_name() can not be executed on the standby, + # get the WAL file name associated to this lsn from the primary + my $walfile_name = $node_primary->safe_psql('postgres', + "SELECT pg_walfile_name('$restart_lsn')"); -# Generate some activity and switch WAL file on the primary -$node_primary->safe_psql( - 'postgres', "create table retain_test(a int); - select pg_switch_wal(); - insert into retain_test values(1); - checkpoint;"); + chomp($walfile_name); -# Wait for the standby to catch up -$node_primary->wait_for_replay_catchup($node_standby); + # Generate some activity and switch WAL file on the primary + $node_primary->safe_psql( + 'postgres', "create table retain_test(a int); + select pg_switch_wal(); + insert into retain_test values(1); + checkpoint;"); -# Request a checkpoint on the standby to trigger the WAL file(s) removal -$node_standby->safe_psql('postgres', 'checkpoint;'); + # Wait for the standby to catch up + $node_primary->wait_for_replay_catchup($node_standby); -# Verify that the WAL file has not been retained on the standby -my $standby_walfile = $node_standby->data_dir . '/pg_wal/' . $walfile_name; -ok(!-f "$standby_walfile", - "invalidated logical slots do not lead to retaining WAL"); + # Request a checkpoint on the standby to trigger the WAL file(s) removal + $node_standby->safe_psql('postgres', 'checkpoint;'); + # Verify that the WAL file has not been retained on the standby + my $standby_walfile = $node_standby->data_dir . '/pg_wal/' . $walfile_name; + ok(!-f "$standby_walfile", + "invalidated logical slots do not lead to retaining WAL"); +} ################################################## # Recovery conflict: Invalidate conflicting slots, including in-use slots # Scenario 2: conflict due to row removal with hot_standby_feedback off. @@ -571,29 +604,30 @@ reactive_slots_change_hfs_and_wait_for_xmins('vacuum_full_', 'row_removal_', 0, 1); # This should trigger the conflict -$node_primary->safe_psql( - 'testdb', qq[ - CREATE TABLE conflict_test(x integer, y text); - DROP TABLE conflict_test; - VACUUM pg_class; - INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal -]); +vacuum_until_can_remove('', 'CREATE TABLE conflict_test(x integer, y text); + DROP TABLE conflict_test;', 'pg_class'); $node_primary->wait_for_replay_catchup($node_standby); -# Check invalidation in the logfile and in pg_stat_database_conflicts -check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class'); +SKIP: +{ + skip 'vacuum on pg_class did not remove dead rows', 1 + unless ($nb_vac < $vac_attempts); -# Verify slots are reported as conflicting in pg_replication_slots -check_slots_conflicting_status(1); + # Check invalidation in the logfile and in pg_stat_database_conflicts + check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class'); -$handle = - make_slot_active($node_standby, 'row_removal_', 0, \$stdout, \$stderr); + # Verify slots are reported as conflicting in pg_replication_slots + check_slots_conflicting_status(1, 'after vacuum on pg_class'); -# We are not able to read from the slot as it has been invalidated -check_pg_recvlogical_stderr($handle, - "can no longer get changes from replication slot \"row_removal_activeslot\"" -); + $handle = + make_slot_active($node_standby, 'row_removal_', 0, \$stdout, \$stderr); + + # We are not able to read from the slot as it has been invalidated + check_pg_recvlogical_stderr($handle, + "can no longer get changes from replication slot \"row_removal_activeslot\"" + ); +} ################################################## # Recovery conflict: Same as Scenario 2 but on a shared catalog table @@ -609,30 +643,30 @@ reactive_slots_change_hfs_and_wait_for_xmins('row_removal_', 'shared_row_removal_', 0, 1); # Trigger the conflict -$node_primary->safe_psql( - 'testdb', qq[ - CREATE ROLE create_trash; - DROP ROLE create_trash; - VACUUM pg_authid; - INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal -]); +vacuum_until_can_remove('', 'CREATE ROLE create_trash; + DROP ROLE create_trash;', 'pg_authid'); $node_primary->wait_for_replay_catchup($node_standby); +SKIP: +{ + skip 'vacuum on pg_authid did not remove dead rows', 1 + unless ($nb_vac < $vac_attempts); -# Check invalidation in the logfile and in pg_stat_database_conflicts -check_for_invalidation('shared_row_removal_', $logstart, - 'with vacuum on pg_authid'); + # Check invalidation in the logfile and in pg_stat_database_conflicts + check_for_invalidation('shared_row_removal_', $logstart, + 'with vacuum on pg_authid'); -# Verify slots are reported as conflicting in pg_replication_slots -check_slots_conflicting_status(1); + # Verify slots are reported as conflicting in pg_replication_slots + check_slots_conflicting_status(1, 'after vacuum on pg_authid'); -$handle = make_slot_active($node_standby, 'shared_row_removal_', 0, \$stdout, - \$stderr); + $handle = make_slot_active($node_standby, 'shared_row_removal_', 0, \$stdout, + \$stderr); -# We are not able to read from the slot as it has been invalidated -check_pg_recvlogical_stderr($handle, - "can no longer get changes from replication slot \"shared_row_removal_activeslot\"" -); + # We are not able to read from the slot as it has been invalidated + check_pg_recvlogical_stderr($handle, + "can no longer get changes from replication slot \"shared_row_removal_activeslot\"" + ); +} ################################################## # Recovery conflict: Same as Scenario 2 but on a non catalog table @@ -646,39 +680,42 @@ reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_', 'no_conflict_', 0, 1); # This should not trigger a conflict -$node_primary->safe_psql( - 'testdb', qq[ - CREATE TABLE conflict_test(x integer, y text); - INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s; - UPDATE conflict_test set x=1, y=1; - VACUUM conflict_test; - INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal -]); +vacuum_until_can_remove('', 'CREATE TABLE conflict_test(x integer, y text); + INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s; + UPDATE conflict_test set x=1, y=1;', 'conflict_test'); + $node_primary->wait_for_replay_catchup($node_standby); -# message should not be issued -ok( !find_in_log( - $node_standby, - "invalidating obsolete slot \"no_conflict_inactiveslot\"", $logstart), - 'inactiveslot slot invalidation is not logged with vacuum on conflict_test' -); +SKIP: +{ + # Checking for invalidation only if vacuum did remove dead rows + skip 'vacuum did not remove dead rows on a non catalog table', 1 + unless ($nb_vac < $vac_attempts); -ok( !find_in_log( - $node_standby, - "invalidating obsolete slot \"no_conflict_activeslot\"", $logstart), - 'activeslot slot invalidation is not logged with vacuum on conflict_test' -); + # message should not be issued + ok( !find_in_log( + $node_standby, + "invalidating obsolete slot \"no_conflict_inactiveslot\"", $logstart), + 'inactiveslot slot invalidation is not logged with vacuum on conflict_test' + ); -# Verify that pg_stat_database_conflicts.confl_active_logicalslot has not been updated -ok( $node_standby->poll_query_until( - 'postgres', - "select (confl_active_logicalslot = 0) from pg_stat_database_conflicts where datname = 'testdb'", - 't'), - 'confl_active_logicalslot not updated' -) or die "Timed out waiting confl_active_logicalslot to be updated"; + ok( !find_in_log( + $node_standby, + "invalidating obsolete slot \"no_conflict_activeslot\"", $logstart), + 'activeslot slot invalidation is not logged with vacuum on conflict_test' + ); -# Verify slots are reported as non conflicting in pg_replication_slots -check_slots_conflicting_status(0); + # Verify that pg_stat_database_conflicts.confl_active_logicalslot has not been updated + ok( $node_standby->poll_query_until( + 'postgres', + "select (confl_active_logicalslot = 0) from pg_stat_database_conflicts where datname = 'testdb'", + 't'), + 'confl_active_logicalslot not updated' + ) or die "Timed out waiting confl_active_logicalslot to be updated"; + + # Verify slots are reported as non conflicting in pg_replication_slots + check_slots_conflicting_status(0, 'after vacuum on a non catalog table'); +} # Turn hot_standby_feedback back on change_hot_standby_feedback_and_wait_for_xmins(1, 0); @@ -715,7 +752,7 @@ $node_primary->wait_for_replay_catchup($node_standby); check_for_invalidation('pruning_', $logstart, 'with on-access pruning'); # Verify slots are reported as conflicting in pg_replication_slots -check_slots_conflicting_status(1); +check_slots_conflicting_status(1, 'after on-access pruning'); $handle = make_slot_active($node_standby, 'pruning_', 0, \$stdout, \$stderr); @@ -759,7 +796,7 @@ $node_primary->wait_for_replay_catchup($node_standby); check_for_invalidation('wal_level_', $logstart, 'due to wal_level'); # Verify slots are reported as conflicting in pg_replication_slots -check_slots_conflicting_status(1); +check_slots_conflicting_status(1, 'due to wal_level'); $handle = make_slot_active($node_standby, 'wal_level_', 0, \$stdout, \$stderr); -- 2.34.1