From 267867927687279840742b76d58580ac5efb45ea Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 5 Mar 2025 17:36:54 -0600 Subject: [PATCH v6 3/3] pg_upgrade: Add --swap for faster file transfer. This new option instructs pg_upgrade to move the data directories from the old cluster to the new cluster and then to replace the catalog files with those generated for the new cluster. This mode can outperform --link, --clone, --copy, and --copy-file-range, especially on clusters with many relations. However, this mode creates many garbage files in the old cluster, which can prolong the file synchronization step. To handle that, we use "initdb --sync-only --no-sync-data-files" for file synchronization, and we synchronize the catalog files as they are transferred. We assume that the database files transferred from the old cluster were synchronized prior to upgrade. This mode also complicates reverting to the old cluster, so we recommend restoring from backup upon failure during or after file transfer. The new mode is limited to clusters located in the same file system and to upgrades from version 10 and newer. 
Discussion: https://postgr.es/m/Zyvop-LxLXBLrZil%40nathan --- doc/src/sgml/ref/pgupgrade.sgml | 59 ++++- src/bin/pg_upgrade/TESTING | 6 +- src/bin/pg_upgrade/check.c | 29 ++- src/bin/pg_upgrade/controldata.c | 21 +- src/bin/pg_upgrade/dump.c | 4 +- src/bin/pg_upgrade/file.c | 14 +- src/bin/pg_upgrade/info.c | 4 +- src/bin/pg_upgrade/meson.build | 1 + src/bin/pg_upgrade/option.c | 7 + src/bin/pg_upgrade/pg_upgrade.c | 16 +- src/bin/pg_upgrade/pg_upgrade.h | 5 +- src/bin/pg_upgrade/relfilenumber.c | 364 +++++++++++++++++++++++++++++ src/bin/pg_upgrade/t/006_swap.pl | 42 ++++ src/common/file_utils.c | 14 +- src/include/common/file_utils.h | 1 + 15 files changed, 553 insertions(+), 34 deletions(-) create mode 100644 src/bin/pg_upgrade/t/006_swap.pl diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index 9ef7a84eed0..6deee1607ec 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -244,7 +244,8 @@ PostgreSQL documentation Copy files to the new cluster. This is the default. (See also - and .) + , , + , and .) @@ -262,6 +263,32 @@ PostgreSQL documentation + + + + + Move the data directories from the old cluster to the new cluster. + Then, replace the catalog files with those generated for the new + cluster. This mode can outperform , + , , and + , especially on clusters with many + relations. + + + However, this mode creates many garbage files in the old cluster, which + can prolong the file synchronization step if + is used. Therefore, it is + recommended to use with + . + + + Additionally, once the file transfer step begins, the old cluster will + be destructively modified and therefore will no longer be safe to + start. See for details. + + + + method @@ -530,6 +557,10 @@ NET STOP postgresql-&majorversion; is started. Clone mode also requires that the old and new data directories be in the same file system. This mode is only available on certain operating systems and file systems. 
+ Swap mode may be the fastest if there are many relations, but you will not + be able to access your old cluster once the file transfer step begins. + Swap mode also requires that the old and new cluster data directories be + in the same file system. @@ -888,6 +919,32 @@ psql --username=postgres --file=script.sql postgres + + + + If the option was used, the old cluster might + be destructively modified: + + + + + If pg_upgrade aborts before reporting that the + old cluster is no longer safe to start, the old cluster was + unmodified; it can be restarted. + + + + + + If pg_upgrade has reported that the old cluster + is no longer safe to start, the old cluster was destructively + modified. The old cluster will need to be restored from backup in + this case. + + + + + diff --git a/src/bin/pg_upgrade/TESTING b/src/bin/pg_upgrade/TESTING index 00842ac6ec3..c3d463c9c29 100644 --- a/src/bin/pg_upgrade/TESTING +++ b/src/bin/pg_upgrade/TESTING @@ -20,13 +20,13 @@ export oldinstall=...otherversion/ (old version's install base path) See DETAILS below for more information about creation of the dump. You can also test the different transfer modes (--copy, --link, ---clone, --copy-file-range) by setting the environment variable +--clone, --copy-file-range, --swap) by setting the environment variable PG_TEST_PG_UPGRADE_MODE to the respective command-line option, like make check PG_TEST_PG_UPGRADE_MODE=--link -The default is --copy. Note that the other modes are not supported on -all operating systems. +The default is --copy. Note that not all modes are supported on all +operating systems. 
DETAILS ------- diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index d32fc3d88ec..81c91fc2912 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -709,7 +709,34 @@ check_new_cluster(void) check_copy_file_range(); break; case TRANSFER_MODE_LINK: - check_hard_link(); + check_hard_link(TRANSFER_MODE_LINK); + break; + case TRANSFER_MODE_SWAP: + + /* + * We do the hard link check for --swap, too, since it's an easy + * way to verify the clusters are in the same file system. This + * allows us to take some shortcuts in the file synchronization + * step. With some more effort, we could probably support the + * separate-file-system use case, but this mode is unlikely to + * offer much benefit if we have to copy the files across file + * system boundaries. + */ + check_hard_link(TRANSFER_MODE_SWAP); + + /* + * There are a few known issues with using --swap to upgrade from + * versions older than 10. For example, the sequence tuple format + * changed in v10, and the visibility map format changed in 9.6. + * While such problems are not insurmountable (and we may have to + * deal with similar problems in the future, anyway), it doesn't + * seem worth the effort to support swap mode for upgrades from + * long-unsupported versions. 
+ */ + if (GET_MAJOR_VERSION(old_cluster.major_version) < 1000) + pg_fatal("Swap mode can only upgrade clusters from PostgreSQL version %s and later.", + "10"); + break; } diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index bd49ea867bf..47ee27ec835 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -751,7 +751,7 @@ check_control_data(ControlData *oldctrl, void -disable_old_cluster(void) +disable_old_cluster(transferMode transfer_mode) { char old_path[MAXPGPATH], new_path[MAXPGPATH]; @@ -766,10 +766,17 @@ disable_old_cluster(void) old_path, new_path); check_ok(); - pg_log(PG_REPORT, "\n" - "If you want to start the old cluster, you will need to remove\n" - "the \".old\" suffix from %s/global/pg_control.old.\n" - "Because \"link\" mode was used, the old cluster cannot be safely\n" - "started once the new cluster has been started.", - old_cluster.pgdata); + if (transfer_mode == TRANSFER_MODE_LINK) + pg_log(PG_REPORT, "\n" + "If you want to start the old cluster, you will need to remove\n" + "the \".old\" suffix from %s/global/pg_control.old.\n" + "Because \"link\" mode was used, the old cluster cannot be safely\n" + "started once the new cluster has been started.", + old_cluster.pgdata); + else if (transfer_mode == TRANSFER_MODE_SWAP) + pg_log(PG_REPORT, "\n" + "Because \"swap\" mode was used, the old cluster can no longer be\n" + "safely started."); + else + pg_fatal("unrecognized transfer mode"); } diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c index b8fd0d0acee..23cb08e8347 100644 --- a/src/bin/pg_upgrade/dump.c +++ b/src/bin/pg_upgrade/dump.c @@ -52,9 +52,11 @@ generate_old_dump(void) snprintf(log_file_name, sizeof(log_file_name), DB_DUMP_LOG_FILE_MASK, old_db->db_oid); parallel_exec_prog(log_file_name, NULL, - "\"%s/pg_dump\" %s --no-data %s --sequence-data --quote-all-identifiers " + "\"%s/pg_dump\" %s --no-data %s %s --quote-all-identifiers " "--binary-upgrade 
--format=custom %s --no-sync --file=\"%s/%s\" %s", new_cluster.bindir, cluster_conn_opts(&old_cluster), + (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? + "" : "--sequence-data", log_opts.verbose ? "--verbose" : "", user_opts.do_statistics ? "" : "--no-statistics", log_opts.dumpdir, diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index 7fd1991204a..91ed16acb08 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -434,7 +434,7 @@ check_copy_file_range(void) } void -check_hard_link(void) +check_hard_link(transferMode transfer_mode) { char existing_file[MAXPGPATH]; char new_link_file[MAXPGPATH]; @@ -444,8 +444,16 @@ check_hard_link(void) unlink(new_link_file); /* might fail */ if (link(existing_file, new_link_file) < 0) - pg_fatal("could not create hard link between old and new data directories: %m\n" - "In link mode the old and new data directories must be on the same file system."); + { + if (transfer_mode == TRANSFER_MODE_LINK) + pg_fatal("could not create hard link between old and new data directories: %m\n" + "In link mode the old and new data directories must be on the same file system."); + else if (transfer_mode == TRANSFER_MODE_SWAP) + pg_fatal("could not create hard link between old and new data directories: %m\n" + "In swap mode the old and new data directories must be on the same file system."); + else + pg_fatal("unrecognized transfer mode"); + } unlink(new_link_file); } diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index ad52de8b607..4b7a56f5b3b 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -490,7 +490,7 @@ get_rel_infos_query(void) " FROM pg_catalog.pg_class c JOIN pg_catalog.pg_namespace n " " ON c.relnamespace = n.oid " " WHERE relkind IN (" CppAsString2(RELKIND_RELATION) ", " - CppAsString2(RELKIND_MATVIEW) ") AND " + CppAsString2(RELKIND_MATVIEW) "%s) AND " /* exclude possible orphaned temp tables */ " ((n.nspname !~ '^pg_temp_' AND " " n.nspname !~ 
'^pg_toast_temp_' AND " @@ -499,6 +499,8 @@ get_rel_infos_query(void) " c.oid >= %u::pg_catalog.oid) OR " " (n.nspname = 'pg_catalog' AND " " relname IN ('pg_largeobject') ))), ", + (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? + ", " CppAsString2(RELKIND_SEQUENCE) : "", FirstNormalObjectId); /* diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build index da84344966a..a4a5eb82690 100644 --- a/src/bin/pg_upgrade/meson.build +++ b/src/bin/pg_upgrade/meson.build @@ -46,6 +46,7 @@ tests += { 't/003_logical_slots.pl', 't/004_subscription.pl', 't/005_char_signedness.pl', + 't/006_swap.pl', ], 'test_kwargs': {'priority': 40}, # pg_upgrade tests are slow }, diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 188dd8d8a8b..7fd7f1d33fc 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -62,6 +62,7 @@ parseCommandLine(int argc, char *argv[]) {"sync-method", required_argument, NULL, 4}, {"no-statistics", no_argument, NULL, 5}, {"set-char-signedness", required_argument, NULL, 6}, + {"swap", no_argument, NULL, 7}, {NULL, 0, NULL, 0} }; @@ -228,6 +229,11 @@ parseCommandLine(int argc, char *argv[]) else pg_fatal("invalid argument for option %s", "--set-char-signedness"); break; + + case 7: + user_opts.transfer_mode = TRANSFER_MODE_SWAP; + break; + default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), os_info.progname); @@ -325,6 +331,7 @@ usage(void) printf(_(" --no-statistics do not import statistics from old cluster\n")); printf(_(" --set-char-signedness=OPTION set new cluster char signedness to \"signed\" or\n" " \"unsigned\"\n")); + printf(_(" --swap move data directories to new cluster\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\n" diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 174cd920840..9295e46aed3 100644 --- 
a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -170,12 +170,14 @@ main(int argc, char **argv) /* * Most failures happen in create_new_objects(), which has completed at - * this point. We do this here because it is just before linking, which - * will link the old and new cluster data files, preventing the old - * cluster from being safely started once the new cluster is started. + * this point. We do this here because it is just before file transfer, + * which for --link will make it unsafe to start the old cluster once the + * new cluster is started, and for --swap will make it unsafe to start the + * old cluster at all. */ - if (user_opts.transfer_mode == TRANSFER_MODE_LINK) - disable_old_cluster(); + if (user_opts.transfer_mode == TRANSFER_MODE_LINK || + user_opts.transfer_mode == TRANSFER_MODE_SWAP) + disable_old_cluster(user_opts.transfer_mode); transfer_all_new_tablespaces(&old_cluster.dbarr, &new_cluster.dbarr, old_cluster.pgdata, new_cluster.pgdata); @@ -212,8 +214,10 @@ main(int argc, char **argv) { prep_status("Sync data directory to disk"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/initdb\" --sync-only \"%s\" --sync-method %s", + "\"%s/initdb\" --sync-only %s \"%s\" --sync-method %s", new_cluster.bindir, + (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? 
+ "--no-sync-data-files" : "", new_cluster.pgdata, user_opts.sync_method); check_ok(); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 4c9d0172149..69c965bb7d0 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -262,6 +262,7 @@ typedef enum TRANSFER_MODE_COPY, TRANSFER_MODE_COPY_FILE_RANGE, TRANSFER_MODE_LINK, + TRANSFER_MODE_SWAP, } transferMode; /* @@ -391,7 +392,7 @@ void create_script_for_old_cluster_deletion(char **deletion_script_file_name); void get_control_data(ClusterInfo *cluster); void check_control_data(ControlData *oldctrl, ControlData *newctrl); -void disable_old_cluster(void); +void disable_old_cluster(transferMode transfer_mode); /* dump.c */ @@ -423,7 +424,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile, const char *schemaName, const char *relName); void check_file_clone(void); void check_copy_file_range(void); -void check_hard_link(void); +void check_hard_link(transferMode transfer_mode); /* fopen_priv() is no longer different from fopen() */ #define fopen_priv(path, mode) fopen(path, mode) diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 8c23c583172..a87e6156911 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -11,11 +11,92 @@ #include +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/int.h" +#include "common/logging.h" #include "pg_upgrade.h" static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit); +/* + * The following set of sync_queue_* functions are used for --swap to reduce + * the amount of time spent synchronizing the swapped catalog files. 
When a + * file is added to the queue, we also alert the file system that we'd like it + * to be persisted to disk in the near future (if that operation is supported + * by the current platform). Once the queue is full, all of the files are + * synchronized to disk. This strategy should generally be much faster than + * simply calling fsync() on the files right away. + * + * The general usage pattern should be something like: + * + * for (int i = 0; i < num_files; i++) + * sync_queue_push(files[i]); + * + * // be sure to sync any remaining files in the queue + * sync_queue_sync_all(); + * sync_queue_destroy(); + */ + +#define SYNC_QUEUE_MAX_LEN (1024) + +static char *sync_queue[SYNC_QUEUE_MAX_LEN]; +static bool sync_queue_inited; +static int sync_queue_len; + +static inline void +sync_queue_init(void) +{ + if (sync_queue_inited) + return; + + sync_queue_inited = true; + for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++) + sync_queue[i] = palloc(MAXPGPATH); +} + +static inline void +sync_queue_sync_all(void) +{ + if (!sync_queue_inited) + return; + + for (int i = 0; i < sync_queue_len; i++) + { + if (fsync_fname(sync_queue[i], false) != 0) + pg_fatal("could not synchronize file \"%s\": %m", sync_queue[i]); + } + + sync_queue_len = 0; +} + +static inline void +sync_queue_push(const char *fname) +{ + sync_queue_init(); + + pre_sync_fname(fname, false); + + strncpy(sync_queue[sync_queue_len++], fname, MAXPGPATH); + if (sync_queue_len >= SYNC_QUEUE_MAX_LEN) + sync_queue_sync_all(); +} + +static inline void +sync_queue_destroy(void) +{ + if (!sync_queue_inited) + return; + + sync_queue_inited = false; + sync_queue_len = 0; + for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++) + { + pfree(sync_queue[i]); + sync_queue[i] = NULL; + } +} /* * transfer_all_new_tablespaces() @@ -41,6 +122,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, case TRANSFER_MODE_LINK: prep_status_progress("Linking user relation files"); break; + case TRANSFER_MODE_SWAP: + 
prep_status_progress("Swapping data directories"); + break; } /* @@ -125,6 +209,267 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, /* We allocate something even for n_maps == 0 */ pg_free(mappings); } + + /* + * Make sure anything pending synchronization in swap mode is fully + * persisted to disk. This is a no-op for other transfer modes. + */ + sync_queue_sync_all(); + sync_queue_destroy(); +} + +/* + * prepare_for_swap() + * + * This function durably moves the database directory from the old cluster to + * the new cluster in preparation for moving the pg_restore-generated catalog + * files into place. Returns false if the database with the given OID does not + * have a directory in the given tablespace, otherwise returns true. + * + * old_cat (the directory for the old catalog files), new_dat (the database + * directory in the new cluster), and moved_dat (the destination for the + * pg_restore-generated database directory) should be sized to MAXPGPATH bytes. + * This function will return the appropriate paths in those variables. + */ +static bool +prepare_for_swap(const char *old_tablespace, Oid db_oid, + char *old_cat, char *new_dat, char *moved_dat) +{ + const char *new_tablespace; + const char *old_tblspc_suffix; + const char *new_tblspc_suffix; + char old_tblspc[MAXPGPATH]; + char new_tblspc[MAXPGPATH]; + char moved_tblspc[MAXPGPATH]; + char old_dat[MAXPGPATH]; + struct stat st; + + if (strcmp(old_tablespace, old_cluster.pgdata) == 0) + { + new_tablespace = new_cluster.pgdata; + new_tblspc_suffix = "/base"; + old_tblspc_suffix = "/base"; + } + else + { + new_tablespace = old_tablespace; + new_tblspc_suffix = new_cluster.tablespace_suffix; + old_tblspc_suffix = old_cluster.tablespace_suffix; + } + + /* Old and new cluster paths. 
*/ + snprintf(old_tblspc, sizeof(old_tblspc), "%s%s", old_tablespace, old_tblspc_suffix); + snprintf(new_tblspc, sizeof(new_tblspc), "%s%s", new_tablespace, new_tblspc_suffix); + snprintf(old_dat, sizeof(old_dat), "%s/%u", old_tblspc, db_oid); + snprintf(new_dat, MAXPGPATH, "%s/%u", new_tblspc, db_oid); + + /* + * Paths for "moved aside" stuff. We intentionally put these in the old + * cluster so that the delete_old_cluster.{sh,bat} script handles them. + */ + snprintf(moved_tblspc, sizeof(moved_tblspc), "%s/moved_for_upgrade", old_tblspc); + snprintf(old_cat, MAXPGPATH, "%s/%u_old_catalogs", moved_tblspc, db_oid); + snprintf(moved_dat, MAXPGPATH, "%s/%u", moved_tblspc, db_oid); + + /* Check that the database directory exists in the given tablespace. */ + if (stat(old_dat, &st) != 0) + { + if (errno != ENOENT) + pg_fatal("could not stat file \"%s\": %m", old_dat); + return false; + } + + /* Create directory for stuff that is moved aside. */ + if (pg_mkdir_p(moved_tblspc, pg_dir_create_mode) != 0 && errno != EEXIST) + pg_fatal("could not create directory \"%s\"", moved_tblspc); + + /* Create directory for old catalog files. */ + if (pg_mkdir_p(old_cat, pg_dir_create_mode) != 0) + pg_fatal("could not create directory \"%s\"", old_cat); + + /* Move the new cluster's database directory aside. */ + if (rename(new_dat, moved_dat) != 0) + pg_fatal("could not rename \"%s\" to \"%s\"", new_dat, moved_dat); + + /* Move the old cluster's database directory into place. */ + if (rename(old_dat, new_dat) != 0) + pg_fatal("could not rename \"%s\" to \"%s\"", old_dat, new_dat); + + return true; +} + +/* + * FileNameMapCmp() + * + * qsort() comparator for FileNameMap that sorts by RelFileNumber. 
+ */ +static int +FileNameMapCmp(const void *a, const void *b) +{ + const FileNameMap *map1 = (const FileNameMap *) a; + const FileNameMap *map2 = (const FileNameMap *) b; + + return pg_cmp_u32(map1->relfilenumber, map2->relfilenumber); +} + +/* + * parse_relfilenumber() + * + * Attempt to parse the RelFileNumber of the given file name. If we can't, + * return InvalidRelFileNumber. Note that this code snippet is lifted from + * parse_filename_for_nontemp_relation(). + */ +static RelFileNumber +parse_relfilenumber(const char *filename) +{ + char *endp; + unsigned long n; + + if (filename[0] < '1' || filename[0] > '9') + return InvalidRelFileNumber; + + errno = 0; + n = strtoul(filename, &endp, 10); + if (errno || filename == endp || n <= 0 || n > PG_UINT32_MAX) + return InvalidRelFileNumber; + + return (RelFileNumber) n; +} + +/* + * swap_catalog_files() + * + * Moves the old catalog files aside, and moves the new catalog files into + * place. + */ +static void +swap_catalog_files(FileNameMap *maps, int size, const char *old_cat, + const char *new_dat, const char *moved_dat) +{ + DIR *dir; + struct dirent *de; + char path[MAXPGPATH]; + char dest[MAXPGPATH]; + RelFileNumber rfn; + + /* Move the old catalog files aside. 
*/ + dir = opendir(new_dat); + if (dir == NULL) + pg_fatal("could not open directory \"%s\": %m", new_dat); + while (errno = 0, (de = readdir(dir)) != NULL) + { + snprintf(path, sizeof(path), "%s/%s", new_dat, de->d_name); + if (get_dirent_type(path, de, false, PG_LOG_ERROR) != PGFILETYPE_REG) + continue; + + rfn = parse_relfilenumber(de->d_name); + if (RelFileNumberIsValid(rfn)) + { + FileNameMap key = {.relfilenumber = rfn}; + + if (bsearch(&key, maps, size, sizeof(FileNameMap), FileNameMapCmp)) + continue; + } + + snprintf(dest, sizeof(dest), "%s/%s", old_cat, de->d_name); + if (rename(path, dest) != 0) + pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest); + } + if (errno) + pg_fatal("could not read directory \"%s\": %m", new_dat); + (void) closedir(dir); + + /* Move the new catalog files into place. */ + dir = opendir(moved_dat); + if (dir == NULL) + pg_fatal("could not open directory \"%s\": %m", moved_dat); + while (errno = 0, (de = readdir(dir)) != NULL) + { + snprintf(path, sizeof(path), "%s/%s", moved_dat, de->d_name); + if (get_dirent_type(path, de, false, PG_LOG_ERROR) != PGFILETYPE_REG) + continue; + + rfn = parse_relfilenumber(de->d_name); + if (RelFileNumberIsValid(rfn)) + { + FileNameMap key = {.relfilenumber = rfn}; + + if (bsearch(&key, maps, size, sizeof(FileNameMap), FileNameMapCmp)) + continue; + } + + snprintf(dest, sizeof(dest), "%s/%s", new_dat, de->d_name); + if (rename(path, dest) != 0) + pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest); + + /* + * We don't fsync() the database files in the file synchronization + * stage of pg_upgrade in swap mode, so we need to synchronize them + * ourselves. We only do this for the catalog files because they were + * created during pg_restore with fsync=off. We assume that the user + * data files were properly persisted to disk when the user last + * shut it down. 
+ */ + if (user_opts.do_sync) + sync_queue_push(dest); + } + if (errno) + pg_fatal("could not read directory \"%s\": %m", moved_dat); + (void) closedir(dir); + + /* Ensure the directory entries are persisted to disk. */ + if (fsync_fname(new_dat, true) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", new_dat); + if (fsync_parent_path(new_dat) != 0) + pg_fatal("could not synchronize parent directory of \"%s\": %m", new_dat); +} + +/* + * do_swap() + * + * Perform the required steps for --swap for a single database. In short this + * moves the old cluster's database directory into the new cluster and then + * replaces any files for system catalogs with the ones that were generated + * during pg_restore. + */ +static void +do_swap(FileNameMap *maps, int size, char *old_tablespace) +{ + char old_cat[MAXPGPATH]; + char new_dat[MAXPGPATH]; + char moved_dat[MAXPGPATH]; + + /* + * We perform many lookups on maps by relfilenumber in swap mode, so make + * sure it's sorted by relfilenumber. maps should already be sorted by + * OID, so in general this shouldn't have much work to do. + */ + qsort(maps, size, sizeof(FileNameMap), FileNameMapCmp); + + /* + * If an old tablespace is given, we only need to process that one. If no + * old tablespace is specified, we need to process all the tablespaces on + * the system. 
+ */ + if (old_tablespace) + { + if (prepare_for_swap(old_tablespace, maps[0].db_oid, + old_cat, new_dat, moved_dat)) + swap_catalog_files(maps, size, old_cat, new_dat, moved_dat); + } + else + { + if (prepare_for_swap(old_cluster.pgdata, maps[0].db_oid, + old_cat, new_dat, moved_dat)) + swap_catalog_files(maps, size, old_cat, new_dat, moved_dat); + + for (int tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + { + if (prepare_for_swap(os_info.old_tablespaces[tblnum], maps[0].db_oid, + old_cat, new_dat, moved_dat)) + swap_catalog_files(maps, size, old_cat, new_dat, moved_dat); + } + } } /* @@ -145,6 +490,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) vm_must_add_frozenbit = true; + /* --swap has its own subroutine */ + if (user_opts.transfer_mode == TRANSFER_MODE_SWAP) + { + /* + * We don't support --swap to upgrade from versions that require + * rewriting the visibility map. We should've failed already if + * someone tries to do that. 
+ */ + Assert(!vm_must_add_frozenbit); + + do_swap(maps, size, old_tablespace); + return; + } + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || @@ -259,6 +618,11 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"", old_file, new_file); linkFile(old_file, new_file, map->nspname, map->relname); + break; + case TRANSFER_MODE_SWAP: + /* swap mode is handled in its own code path */ + pg_fatal("should never happen"); + break; } } } diff --git a/src/bin/pg_upgrade/t/006_swap.pl b/src/bin/pg_upgrade/t/006_swap.pl new file mode 100644 index 00000000000..5ab0cc1dc00 --- /dev/null +++ b/src/bin/pg_upgrade/t/006_swap.pl @@ -0,0 +1,42 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Tests for --swap + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize old and new clusters +my $old = PostgreSQL::Test::Cluster->new('old'); +my $new = PostgreSQL::Test::Cluster->new('new'); +$old->init(); +$new->init(); + +$old->start; +$old->safe_psql('postgres', "CREATE TABLE test AS SELECT generate_series(1, 5432)"); +$old->stop; + +# pg_upgrade should be successful. 
+command_ok( + [ + 'pg_upgrade', '--no-sync', + '--old-datadir' => $old->data_dir, + '--new-datadir' => $new->data_dir, + '--old-bindir' => $old->config_data('--bindir'), + '--new-bindir' => $new->config_data('--bindir'), + '--socketdir' => $new->host, + '--old-port' => $old->port, + '--new-port' => $new->port, + '--swap' + ], + 'run of pg_upgrade --swap'); + +$new->start; +my $result = $new->safe_psql('postgres', "SELECT COUNT(*) FROM test"); +is($result, '5432', 'table data after pg_upgrade --swap'); +$new->stop; + +done_testing(); diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 78e272916f5..4405ef8b425 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -45,9 +45,6 @@ */ #define MINIMUM_VERSION_FOR_PG_WAL 100000 -#ifdef PG_FLUSH_DATA_WORKS -static int pre_sync_fname(const char *fname, bool isdir); -#endif static void walkdir(const char *path, int (*action) (const char *fname, bool isdir), bool process_symlinks, @@ -352,16 +349,16 @@ walkdir(const char *path, } /* - * Hint to the OS that it should get ready to fsync() this file. + * Hint to the OS that it should get ready to fsync() this file, if supported + * by the platform. * * Ignores errors trying to open unreadable files, and reports other errors * non-fatally. 
*/ -#ifdef PG_FLUSH_DATA_WORKS - -static int +int pre_sync_fname(const char *fname, bool isdir) { +#ifdef PG_FLUSH_DATA_WORKS int fd; fd = open(fname, O_RDONLY | PG_BINARY, 0); @@ -388,11 +385,10 @@ pre_sync_fname(const char *fname, bool isdir) #endif (void) close(fd); +#endif /* PG_FLUSH_DATA_WORKS */ return 0; } -#endif /* PG_FLUSH_DATA_WORKS */ - /* * fsync_fname -- Try to fsync a file or directory * diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 8274bc877ab..9fd88953e43 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -33,6 +33,7 @@ typedef enum DataDirSyncMethod struct iovec; /* avoid including port/pg_iovec.h here */ #ifdef FRONTEND +extern int pre_sync_fname(const char *fname, bool isdir); extern int fsync_fname(const char *fname, bool isdir); extern void sync_pgdata(const char *pg_data, int serverVersion, DataDirSyncMethod sync_method, bool sync_data_files); -- 2.39.5 (Apple Git-154)