From dba5a2a57e5472682a9be6d4badf1e8127768610 Mon Sep 17 00:00:00 2001 From: David Steele Date: Thu, 3 Oct 2024 08:41:45 +0000 Subject: Return pg_control from pg_backup_stop(). Harden recovery by returning a copy of pg_control from pg_backup_stop() that has a flag set to prevent recovery if the backup_label file is missing. Instead of backup software copying pg_control from PGDATA, it stores an updated version that is returned from pg_backup_stop(). This is better for the following reasons: * The user can no longer remove backup_label and get what looks like a successful recovery (while almost certainly causing corruption). If backup_label is removed the cluster will not start. The user may try pg_resetwal, but that tool makes it pretty clear that corruption will result from its use. * We don't need to worry about backup software seeing a torn copy of pg_control, since Postgres can safely read it out of memory and provide a valid copy via pg_backup_stop(). This solves torn reads without needing to write pg_control via a temp file, which may affect performance on a standby. * For backup from standby, we no longer need to instruct the backup software to copy pg_control last. In fact the backup software should not copy pg_control from PGDATA at all. These changes have no impact on current backup software and they are free to use the pg_control available from pg_stop_backup() or continue to use pg_control from PGDATA. Of course they will miss the benefits of getting a consistent copy of pg_control and the backup_label checking, but will be no worse off than before. Catalog version bump is required. --- doc/src/sgml/backup.sgml | 18 +++++- doc/src/sgml/func.sgml | 5 +- src/backend/access/transam/xlogfuncs.c | 17 ++++-- src/backend/catalog/system_functions.sql | 2 +- src/include/catalog/pg_proc.dat | 4 +- src/test/recovery/t/042_low_level_backup.pl | 67 ++++++++++++++++++++- 6 files changed, 98 insertions(+), 15 deletions(-) diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index e4e4c56cf1..2fcf181121 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -1021,9 +1021,12 @@ SELECT * FROM pg_backup_stop(wait_for_archive => true); values. The second of these fields should be written to a file named backup_label in the root directory of the backup. The third field should be written to a file named - tablespace_map unless the field is empty. These files are + tablespace_map unless the field is empty. The fourth + field should be written into a file named + global/pg_control (overwriting the existing file when + present). These files are vital to the backup working and must be written byte for byte without - modification, which may require opening the file in binary mode. + modification, which will require opening the file in binary mode. @@ -1095,7 +1098,16 @@ SELECT * FROM pg_backup_stop(wait_for_archive => true); - You should, however, omit from the backup the files within the + You should exclude global/pg_control from your backup + and put the contents of the controlfile column + returned from pg_backup_stop in your backup at + global/pg_control. This version of pg_control contains + safeguards against recovery without backup_label present and is guaranteed + not to be torn. + + + + You should also omit from the backup the files within the cluster's pg_wal/ subdirectory. This slight adjustment is worthwhile because it reduces the risk of mistakes when restoring. This is easy to arrange if diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 786269b920..09204db7cd 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -28581,8 +28581,9 @@ LOG: Grand total: 1651920 bytes in 201 blocks; 622360 free (88 chunks); 1029560 The result of the function is a single record. The lsn column holds the backup's ending write-ahead log location (which again can be ignored). The second - column returns the contents of the backup label file, and the third - column returns the contents of the tablespace map file. These must be + column returns the contents of the backup label file, the third column + returns the contents of the tablespace map file, and the fourth column + returns the contents of pg_control. These must be stored as part of the backup and are required as part of the restore process. diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 3e3d2bb618..a7d7c1bcbe 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -115,9 +115,11 @@ pg_backup_start(PG_FUNCTION_ARGS) * * The backup_label contains the user-supplied label string (typically this * would be used to tell where the backup dump will be stored), the starting - * time, starting WAL location for the dump and so on. It is the caller's - * responsibility to write the backup_label and tablespace_map files in the - * data folder that will be restored from this backup. + * time, starting WAL location for the dump and so on. The pg_control file + * contains a consistent copy of pg_control that also has a safeguard against + * being used without backup_label. It is the caller's responsibility to write + * the backup_label, pg_control, and tablespace_map files in the data folder + * that will be restored from this backup. * * Permission checking for this function is managed through the normal * GRANT system. @@ -125,12 +127,13 @@ pg_backup_start(PG_FUNCTION_ARGS) Datum pg_backup_stop(PG_FUNCTION_ARGS) { -#define PG_BACKUP_STOP_V2_COLS 3 +#define PG_BACKUP_STOP_V2_COLS 4 TupleDesc tupdesc; Datum values[PG_BACKUP_STOP_V2_COLS] = {0}; bool nulls[PG_BACKUP_STOP_V2_COLS] = {0}; bool waitforarchive = PG_GETARG_BOOL(0); char *backup_label; + bytea *pg_control_bytea; SessionBackupState status = get_backup_status(); /* Initialize attributes information in the tuple descriptor */ @@ -152,9 +155,15 @@ pg_backup_stop(PG_FUNCTION_ARGS) /* Build the contents of backup_label */ backup_label = build_backup_content(backup_state, false); + /* Build the contents of pg_control */ + pg_control_bytea = (bytea *) palloc(PG_CONTROL_FILE_SIZE + VARHDRSZ); + SET_VARSIZE(pg_control_bytea, PG_CONTROL_FILE_SIZE + VARHDRSZ); + backup_control_file((uint8_t *)VARDATA(pg_control_bytea)); + values[0] = LSNGetDatum(backup_state->stoppoint); values[1] = CStringGetTextDatum(backup_label); values[2] = CStringGetTextDatum(tablespace_map->data); + values[3] = PointerGetDatum(pg_control_bytea); /* Deallocate backup-related variables */ pfree(backup_label); diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql index b0d0de051e..758f90b2c3 100644 --- a/src/backend/catalog/system_functions.sql +++ b/src/backend/catalog/system_functions.sql @@ -390,7 +390,7 @@ CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION pg_backup_stop ( wait_for_archive boolean DEFAULT true, OUT lsn pg_lsn, - OUT labelfile text, OUT spcmapfile text) + OUT labelfile text, OUT spcmapfile text, OUT controlfile bytea) RETURNS record STRICT VOLATILE LANGUAGE internal as 'pg_backup_stop' PARALLEL RESTRICTED; diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 7cc5421ea1..ea93241eed 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -6569,8 +6569,8 @@ { oid => '2739', descr => 'finish taking an online backup', proname => 'pg_backup_stop', provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => 'bool', - proallargtypes => '{bool,pg_lsn,text,text}', proargmodes => '{i,o,o,o}', - proargnames => '{wait_for_archive,lsn,labelfile,spcmapfile}', + proallargtypes => '{bool,pg_lsn,text,text,bytea}', proargmodes => '{i,o,o,o,o}', + proargnames => '{wait_for_archive,lsn,labelfile,spcmapfile,controlfile}', prosrc => 'pg_backup_stop' }, { oid => '3436', descr => 'promote standby server', proname => 'pg_promote', provolatile => 'v', prorettype => 'bool', diff --git a/src/test/recovery/t/042_low_level_backup.pl b/src/test/recovery/t/042_low_level_backup.pl index 61d23187e0..bd3a99960f 100644 --- a/src/test/recovery/t/042_low_level_backup.pl +++ b/src/test/recovery/t/042_low_level_backup.pl @@ -13,6 +13,42 @@ use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; use Test::More; +# Decode hex to binary +sub decode_hex +{ + my ($encoded) = @_; + my $decoded; + + $encoded =~ s/^\s+|\s+$//g; + + for (my $idx = 0; $idx < length($encoded); $idx += 2) + { + $decoded .= pack('C', hex(substr($encoded, $idx, 2))); + } + + return $decoded; +} + +# Get backup_label/pg_control from pg_stop_backup() +sub stop_backup_result +{ + my ($psql) = @_; + + my $encoded = $psql->query_safe( + "select encode(labelfile::bytea, 'hex') || ',' || " . + " encode(controlfile, 'hex')" . + " from pg_backup_stop()"); + + my @result; + + foreach my $column (split(',', $encoded)) + { + push(@result, decode_hex($column)); + } + + return @result; +} + # Start primary node with archiving. my $node_primary = PostgreSQL::Test::Cluster->new('primary'); $node_primary->init(has_archiving => 1, allows_streaming => 1); @@ -80,8 +116,7 @@ my $stop_segment_name = $node_primary->safe_psql('postgres', 'SELECT pg_walfile_name(pg_current_wal_lsn())'); # Stop backup and get backup_label, the last segment is archived. -my $backup_label = - $psql->query_safe("select labelfile from pg_backup_stop()"); +(my $backup_label, my $pg_control) = stop_backup_result($psql); $psql->quit; @@ -118,10 +153,36 @@ ok( $node_replica->log_contains( $node_replica->teardown_node; $node_replica->clean_node; +# Save only pg_control into the backup to demonstrate that pg_control returned +# from pg_stop_backup() will only perform recovery when backup_label is present. +open(my $fh, ">", "$backup_dir/global/pg_control") + or die "could not open pg_control"; +binmode($fh); +syswrite($fh, $pg_control); +close($fh); + +$node_replica = PostgreSQL::Test::Cluster->new('replica_fail2'); +$node_replica->init_from_backup($node_primary, $backup_name, + has_restoring => 1); + +my $res = run_log( + [ + 'pg_ctl', '-D', $node_replica->data_dir, '-l', + $node_replica->logfile, 'start' + ]); +ok(!$res, 'invalid recovery startup fails'); + +my $logfile = slurp_file($node_replica->logfile()); +ok($logfile =~ qr/could not find backup_label required for recovery/, + 'could not find backup_label required for recovery'); + +$node_replica->teardown_node; +$node_replica->clean_node; + # Save backup_label into the backup directory and recover using the primary's # archive. This time recovery will succeed and the canary table will be # present. -open my $fh, ">>", "$backup_dir/backup_label" +open $fh, ">>", "$backup_dir/backup_label" or die "could not open backup_label"; # Binary mode is required for Windows, as the backup_label parsing is not # able to cope with CRLFs. -- 2.34.1