From 531872dcdb09f5d2f89af44a3c04c9d2d6da89be Mon Sep 17 00:00:00 2001 From: David Steele Date: Fri, 17 May 2024 02:42:35 +0000 Subject: Return pg_control from pg_backup_stop(). Harden recovery by returning a copy of pg_control from pg_backup_stop() that has a flag set to prevent recovery if the backup_label file is missing. Instead of backup software copying pg_control from PGDATA, it stores an updated version that is returned from pg_backup_stop(). This is better for the following reasons: * The user can no longer remove backup_label and get what looks like a successful recovery (while almost certainly causing corruption). If backup_label is removed the cluster will not start. The user may try pg_resetwal, but that tool makes it pretty clear that corruption will result from its use. * We don't need to worry about backup software seeing a torn copy of pg_control, since Postgres can safely read it out of memory and provide a valid copy via pg_backup_stop(). This solves torn reads without needing to write pg_control via a temp file, which may affect performance on a standby. * For backup from standby, we no longer need to instruct the backup software to copy pg_control last. In fact the backup software should not copy pg_control from PGDATA at all. These changes have no impact on current backup software and they are free to use the pg_control available from pg_stop_backup() or continue to use pg_control from PGDATA. Of course they will miss the benefits of getting a consistent copy of pg_control and the backup_label checking, but will be no worse off than before. Control and catalog version bumps are required. --- doc/src/sgml/backup.sgml | 18 +++++- doc/src/sgml/func.sgml | 10 ++- src/backend/access/transam/xlog.c | 19 ++++++ src/backend/access/transam/xlogfuncs.c | 17 ++++-- src/backend/access/transam/xlogrecovery.c | 10 ++- src/backend/backup/basebackup.c | 19 +++--- src/backend/catalog/system_functions.sql | 2 +- src/backend/utils/misc/pg_controldata.c | 7 ++- src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_resetwal/pg_resetwal.c | 1 + src/bin/pg_rewind/pg_rewind.c | 1 + src/include/access/xlogbackup.h | 8 +++ src/include/catalog/pg_control.h | 4 ++ src/include/catalog/pg_proc.dat | 10 +-- src/test/recovery/t/002_archiving.pl | 20 ++++++ src/test/recovery/t/042_low_level_backup.pl | 67 ++++++++++++++++++++- 16 files changed, 182 insertions(+), 33 deletions(-) diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index 91da3c26ba..1b87a05dc5 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -1010,9 +1010,12 @@ SELECT * FROM pg_backup_stop(wait_for_archive => true); values. The second of these fields should be written to a file named backup_label in the root directory of the backup. The third field should be written to a file named - tablespace_map unless the field is empty. These files are + tablespace_map unless the field is empty. The fourth + field should be written into a file named + global/pg_control (overwriting the existing file when + present). These files are vital to the backup working and must be written byte for byte without - modification, which may require opening the file in binary mode. + modification, which will require opening the file in binary mode. @@ -1084,7 +1087,16 @@ SELECT * FROM pg_backup_stop(wait_for_archive => true); - You should, however, omit from the backup the files within the + You should exclude global/pg_control from your backup + and put the contents of the controlfile column + returned from pg_backup_stop in your backup at + global/pg_control. This version of pg_control contains + safeguards against recovery without backup_label present and is guaranteed + not to be torn. + + + + You should also omit from the backup the files within the cluster's pg_wal/ subdirectory. This slight adjustment is worthwhile because it reduces the risk of mistakes when restoring. This is easy to arrange if diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 17c44bc338..88f942af44 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -27697,6 +27697,11 @@ SELECT currval(pg_get_serial_sequence('sometable', 'id')); boolean + + backup_label_required + boolean + + @@ -28355,8 +28360,9 @@ LOG: Grand total: 1651920 bytes in 201 blocks; 622360 free (88 chunks); 1029560 The result of the function is a single record. The lsn column holds the backup's ending write-ahead log location (which again can be ignored). The second - column returns the contents of the backup label file, and the third - column returns the contents of the tablespace map file. These must be + column returns the contents of the backup label file, the third column + returns the contents of the tablespace map file, and the fourth column + returns the contents of pg_control. These must be stored as part of the backup and are required as part of the restore process. diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c3fd9c1eae..7d7c3d727e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9048,11 +9048,30 @@ do_pg_backup_stop(BackupState *state, bool waitforarchive) int seconds_before_warning; int waits = 0; bool reported_waiting = false; + ControlFileData *controlFileCopy = (ControlFileData *)state->controlFile; Assert(state != NULL); backup_stopped_in_recovery = RecoveryInProgress(); + /* + * Create a copy of control data and update it to require a backup label + * for recovery. Also recalculate the CRC. + */ + memset( + state->controlFile + sizeof(ControlFileData), 0, + PG_CONTROL_FILE_SIZE - sizeof(ControlFileData)); + + LWLockAcquire(ControlFileLock, LW_SHARED); + memcpy(controlFileCopy, ControlFile, sizeof(ControlFileData)); + LWLockRelease(ControlFileLock); + + controlFileCopy->backupLabelRequired = true; + + INIT_CRC32C(controlFileCopy->crc); + COMP_CRC32C(controlFileCopy->crc, controlFileCopy, offsetof(ControlFileData, crc)); + FIN_CRC32C(controlFileCopy->crc); + /* * During recovery, we don't need to check WAL level. Because, if WAL * level is not sufficient, it's impossible to get here during recovery. diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 92bdb17ed5..2576cacf22 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -112,9 +112,11 @@ pg_backup_start(PG_FUNCTION_ARGS) * * The backup_label contains the user-supplied label string (typically this * would be used to tell where the backup dump will be stored), the starting - * time, starting WAL location for the dump and so on. It is the caller's - * responsibility to write the backup_label and tablespace_map files in the - * data folder that will be restored from this backup. + * time, starting WAL location for the dump and so on. The pg_control file + * contains represents a consistent copy of pg_control that also has a safeguard + * against being used without backup_label. It is the caller's responsibility + * to write the backup_label, pg_control, and tablespace_map files in the data + * folder that will be restored from this backup. * * Permission checking for this function is managed through the normal * GRANT system. @@ -122,12 +124,13 @@ pg_backup_start(PG_FUNCTION_ARGS) Datum pg_backup_stop(PG_FUNCTION_ARGS) { -#define PG_BACKUP_STOP_V2_COLS 3 +#define PG_BACKUP_STOP_V2_COLS 4 TupleDesc tupdesc; Datum values[PG_BACKUP_STOP_V2_COLS] = {0}; bool nulls[PG_BACKUP_STOP_V2_COLS] = {0}; bool waitforarchive = PG_GETARG_BOOL(0); char *backup_label; + bytea *pg_control_bytea; SessionBackupState status = get_backup_status(); /* Initialize attributes information in the tuple descriptor */ @@ -149,9 +152,15 @@ pg_backup_stop(PG_FUNCTION_ARGS) /* Build the contents of backup_label */ backup_label = build_backup_content(backup_state, false); + /* Build the contents of pg_control */ + pg_control_bytea = (bytea *) palloc(PG_CONTROL_FILE_SIZE + VARHDRSZ); + SET_VARSIZE(pg_control_bytea, PG_CONTROL_FILE_SIZE + VARHDRSZ); + memcpy(VARDATA(pg_control_bytea), backup_state->controlFile, PG_CONTROL_FILE_SIZE); + values[0] = LSNGetDatum(backup_state->stoppoint); values[1] = CStringGetTextDatum(backup_label); values[2] = CStringGetTextDatum(tablespace_map->data); + values[3] = PointerGetDatum(pg_control_bytea); /* Deallocate backup-related variables */ pfree(backup_label); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 29c5bec084..0940e4730e 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -703,7 +703,14 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, } else { - /* No backup_label file has been found if we are here. */ + /* + * No backup_label file has been found if we are here. Error if the + * control file requires backup_label. + */ + if (ControlFile->backupLabelRequired) + ereport(FATAL, + (errmsg("could not find backup_label required for recovery"), + errhint("backup_label must be present for recovery to succeed"))); /* * If tablespace_map file is present without backup_label file, there @@ -976,6 +983,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, { ControlFile->backupStartPoint = checkPoint.redo; ControlFile->backupEndRequired = backupEndRequired; + ControlFile->backupLabelRequired = false; if (backupFromStandby) { diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 9a2bf59e84..ae0f83416a 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -324,7 +324,6 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, if (ti->path == NULL) { - struct stat statbuf; bool sendtblspclinks = true; char *backup_label; @@ -348,15 +347,14 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, sendDir(sink, ".", 1, false, state.tablespaces, sendtblspclinks, &manifest, InvalidOid, ib); + /* End the backup before sending pg_control */ + basebackup_progress_wait_wal_archive(&state); + do_pg_backup_stop(backup_state, !opt->nowait); + /* ... and pg_control after everything else. */ - if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", - XLOG_CONTROL_FILE))); - sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, - false, InvalidOid, InvalidOid, - InvalidRelFileNumber, 0, &manifest, 0, NULL, 0); + sendFileWithContent(sink, XLOG_CONTROL_FILE, + (char *)backup_state->controlFile, + PG_CONTROL_FILE_SIZE, &manifest); } else { @@ -390,9 +388,6 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, } } - basebackup_progress_wait_wal_archive(&state); - do_pg_backup_stop(backup_state, !opt->nowait); - endptr = backup_state->stoppoint; endtli = backup_state->stoptli; diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql index ae099e328c..d76d3c608f 100644 --- a/src/backend/catalog/system_functions.sql +++ b/src/backend/catalog/system_functions.sql @@ -390,7 +390,7 @@ CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION pg_backup_stop ( wait_for_archive boolean DEFAULT true, OUT lsn pg_lsn, - OUT labelfile text, OUT spcmapfile text) + OUT labelfile text, OUT spcmapfile text, OUT controlfile bytea) RETURNS record STRICT VOLATILE LANGUAGE internal as 'pg_backup_stop' PARALLEL RESTRICTED; diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c index 98c932dc7b..9eaf3f8b9f 100644 --- a/src/backend/utils/misc/pg_controldata.c +++ b/src/backend/utils/misc/pg_controldata.c @@ -162,8 +162,8 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) Datum pg_control_recovery(PG_FUNCTION_ARGS) { - Datum values[5]; - bool nulls[5]; + Datum values[6]; + bool nulls[6]; TupleDesc tupdesc; HeapTuple htup; ControlFileData *ControlFile; @@ -195,6 +195,9 @@ pg_control_recovery(PG_FUNCTION_ARGS) values[4] = BoolGetDatum(ControlFile->backupEndRequired); nulls[4] = false; + values[5] = BoolGetDatum(ControlFile->backupLabelRequired); + nulls[5] = false; + htup = heap_form_tuple(tupdesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(htup)); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 93e0837947..2114488dc0 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -283,6 +283,8 @@ main(int argc, char *argv[]) LSN_FORMAT_ARGS(ControlFile->backupEndPoint)); printf(_("End-of-backup record required: %s\n"), ControlFile->backupEndRequired ? _("yes") : _("no")); + printf(_("Backup label required: %s\n"), + ControlFile->backupLabelRequired ? _("yes") : _("no")); printf(_("wal_level setting: %s\n"), wal_level_str(ControlFile->wal_level)); printf(_("wal_log_hints setting: %s\n"), diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index e9dcb5a6d8..7056752cff 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -875,6 +875,7 @@ RewriteControlFile(void) ControlFile.backupStartPoint = 0; ControlFile.backupEndPoint = 0; ControlFile.backupEndRequired = false; + ControlFile.backupLabelRequired = false; /* * Force the defaults for max_* settings. The values don't really matter diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 8449ae78ef..7c27d47110 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -722,6 +722,7 @@ perform_rewind(filemap_t *filemap, rewind_source *source, ControlFile_new.minRecoveryPoint = endrec; ControlFile_new.minRecoveryPointTLI = endtli; ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY; + ControlFile_new.backupLabelRequired = true; if (!dry_run) update_controlfile(datadir_target, &ControlFile_new, do_sync); } diff --git a/src/include/access/xlogbackup.h b/src/include/access/xlogbackup.h index c30d4a5991..93a0e5c5cc 100644 --- a/src/include/access/xlogbackup.h +++ b/src/include/access/xlogbackup.h @@ -15,6 +15,7 @@ #define XLOG_BACKUP_H #include "access/xlogdefs.h" +#include "catalog/pg_control.h" #include "pgtime.h" /* Structure to hold backup state. */ @@ -35,6 +36,13 @@ typedef struct BackupState XLogRecPtr stoppoint; /* backup stop WAL location */ TimeLineID stoptli; /* backup stop TLI */ pg_time_t stoptime; /* backup stop time */ + + /* + * After pg_backup_stop() returns this field will contain a copy of + * pg_control that should be stored with the backup. backupLabelRequired + * has been set so backup_label will be required for recovery to start. + */ + uint8_t controlFile[PG_CONTROL_FILE_SIZE]; } BackupState; extern char *build_backup_content(BackupState *state, diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index a00606ffcd..71a9beb05a 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -163,12 +163,16 @@ typedef struct ControlFileData * If backupEndRequired is true, we know for sure that we're restoring * from a backup, and must see a backup-end record before we can safely * start up. + * + * If backupLabelRequired is true, then a backup_label file must be + * present in order for recovery to succeed. */ XLogRecPtr minRecoveryPoint; TimeLineID minRecoveryPointTLI; XLogRecPtr backupStartPoint; XLogRecPtr backupEndPoint; bool backupEndRequired; + bool backupLabelRequired; /* * Parameter settings that determine if the WAL can be used for archival diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 6a5476d3c4..6cb87fb744 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -6463,8 +6463,8 @@ { oid => '2739', descr => 'finish taking an online backup', proname => 'pg_backup_stop', provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => 'bool', - proallargtypes => '{bool,pg_lsn,text,text}', proargmodes => '{i,o,o,o}', - proargnames => '{wait_for_archive,lsn,labelfile,spcmapfile}', + proallargtypes => '{bool,pg_lsn,text,text,bytea}', proargmodes => '{i,o,o,o,o}', + proargnames => '{wait_for_archive,lsn,labelfile,spcmapfile,controlfile}', prosrc => 'pg_backup_stop' }, { oid => '3436', descr => 'promote standby server', proname => 'pg_promote', provolatile => 'v', prorettype => 'bool', @@ -11981,9 +11981,9 @@ { oid => '3443', descr => 'pg_controldata recovery state information as a function', proname => 'pg_control_recovery', provolatile => 'v', prorettype => 'record', - proargtypes => '', proallargtypes => '{pg_lsn,int4,pg_lsn,pg_lsn,bool}', - proargmodes => '{o,o,o,o,o}', - proargnames => '{min_recovery_end_lsn,min_recovery_end_timeline,backup_start_lsn,backup_end_lsn,end_of_backup_record_required}', + proargtypes => '', proallargtypes => '{pg_lsn,int4,pg_lsn,pg_lsn,bool,bool}', + proargmodes => '{o,o,o,o,o,o}', + proargnames => '{min_recovery_end_lsn,min_recovery_end_timeline,backup_start_lsn,backup_end_lsn,end_of_backup_record_required,backup_label_required}', prosrc => 'pg_control_recovery' }, { oid => '3444', diff --git a/src/test/recovery/t/002_archiving.pl b/src/test/recovery/t/002_archiving.pl index a2e012e42d..653546a6c0 100644 --- a/src/test/recovery/t/002_archiving.pl +++ b/src/test/recovery/t/002_archiving.pl @@ -41,6 +41,26 @@ $node_standby->append_conf( archive_cleanup_command = 'echo archive_cleanup_done > $archive_cleanup_command_file' recovery_end_command = 'echo recovery_ended_done > $recovery_end_command_file' )); + +# Rename backup_label to verify that recovery will not start without it +rename("$data_dir/backup_label", "$data_dir/backup_label.tmp") + or BAIL_OUT "could not move $data_dir/backup_label"; + +my $res = run_log( + [ + 'pg_ctl', '-D', $node_standby->data_dir, '-l', + $node_standby->logfile, 'start' + ]); +ok(!$res, 'invalid recovery startup fails'); + +my $logfile = slurp_file($node_standby->logfile()); +ok($logfile =~ qr/could not find backup_label required for recovery/, + 'could not find backup_label required for recovery'); + +# Restore backup_label so recovery proceeds normally +rename("$data_dir/backup_label.tmp", "$data_dir/backup_label") + or BAIL_OUT "could not move $data_dir/backup_label"; + $node_standby->start; # Create some content on primary diff --git a/src/test/recovery/t/042_low_level_backup.pl b/src/test/recovery/t/042_low_level_backup.pl index 61d23187e0..bd3a99960f 100644 --- a/src/test/recovery/t/042_low_level_backup.pl +++ b/src/test/recovery/t/042_low_level_backup.pl @@ -13,6 +13,42 @@ use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; use Test::More; +# Decode hex to binary +sub decode_hex +{ + my ($encoded) = @_; + my $decoded; + + $encoded =~ s/^\s+|\s+$//g; + + for (my $idx = 0; $idx < length($encoded); $idx += 2) + { + $decoded .= pack('C', hex(substr($encoded, $idx, 2))); + } + + return $decoded; +} + +# Get backup_label/pg_control from pg_stop_backup() +sub stop_backup_result +{ + my ($psql) = @_; + + my $encoded = $psql->query_safe( + "select encode(labelfile::bytea, 'hex') || ',' || " . + " encode(controlfile, 'hex')" . + " from pg_backup_stop()"); + + my @result; + + foreach my $column (split(',', $encoded)) + { + push(@result, decode_hex($column)); + } + + return @result; +} + # Start primary node with archiving. my $node_primary = PostgreSQL::Test::Cluster->new('primary'); $node_primary->init(has_archiving => 1, allows_streaming => 1); @@ -80,8 +116,7 @@ my $stop_segment_name = $node_primary->safe_psql('postgres', 'SELECT pg_walfile_name(pg_current_wal_lsn())'); # Stop backup and get backup_label, the last segment is archived. -my $backup_label = - $psql->query_safe("select labelfile from pg_backup_stop()"); +(my $backup_label, my $pg_control) = stop_backup_result($psql); $psql->quit; @@ -118,10 +153,36 @@ ok( $node_replica->log_contains( $node_replica->teardown_node; $node_replica->clean_node; +# Save only pg_control into the backup to demonstrate that pg_control returned +# from pg_stop_backup() will only perform recovery when backup_label is present. +open(my $fh, ">", "$backup_dir/global/pg_control") + or die "could not open pg_control"; +binmode($fh); +syswrite($fh, $pg_control); +close($fh); + +$node_replica = PostgreSQL::Test::Cluster->new('replica_fail2'); +$node_replica->init_from_backup($node_primary, $backup_name, + has_restoring => 1); + +my $res = run_log( + [ + 'pg_ctl', '-D', $node_replica->data_dir, '-l', + $node_replica->logfile, 'start' + ]); +ok(!$res, 'invalid recovery startup fails'); + +my $logfile = slurp_file($node_replica->logfile()); +ok($logfile =~ qr/could not find backup_label required for recovery/, + 'could not find backup_label required for recovery'); + +$node_replica->teardown_node; +$node_replica->clean_node; + # Save backup_label into the backup directory and recover using the primary's # archive. This time recovery will succeed and the canary table will be # present. -open my $fh, ">>", "$backup_dir/backup_label" +open $fh, ">>", "$backup_dir/backup_label" or die "could not open backup_label"; # Binary mode is required for Windows, as the backup_label parsing is not # able to cope with CRLFs. -- 2.34.1