From f5eca13b8b04760977ab41ef9cd023a47e5cbbbd Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Tue, 5 Nov 2024 16:38:19 -0600 Subject: [PATCH v2 3/8] Introduce catalog-swap mode for pg_upgrade. THIS IS A PROOF OF CONCEPT AND IS NOT READY FOR SERIOUS REVIEW. This new mode moves the database directories from the old cluster to the new cluster and then swaps the pg_restore-generated catalog files in place. This can significantly increase the length of the following data synchronization step (due to the large number of unsynchronized pg_restore-generated files), but this problem will be handled in follow-up commits. --- src/bin/pg_upgrade/check.c | 2 + src/bin/pg_upgrade/option.c | 5 + src/bin/pg_upgrade/pg_upgrade.h | 1 + src/bin/pg_upgrade/relfilenumber.c | 167 +++++++++++++++++++++++++++++ src/tools/pgindent/typedefs.list | 1 + 5 files changed, 176 insertions(+) diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 94164f0472..a4bb365718 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -711,6 +711,8 @@ check_new_cluster(void) case TRANSFER_MODE_LINK: check_hard_link(); break; + case TRANSFER_MODE_CATALOG_SWAP: + break; } check_is_install_user(&new_cluster); diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 6f41d63eed..64091a54c4 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -60,6 +60,7 @@ parseCommandLine(int argc, char *argv[]) {"copy", no_argument, NULL, 2}, {"copy-file-range", no_argument, NULL, 3}, {"sync-method", required_argument, NULL, 4}, + {"catalog-swap", no_argument, NULL, 5}, {NULL, 0, NULL, 0} }; @@ -212,6 +213,10 @@ parseCommandLine(int argc, char *argv[]) user_opts.sync_method = pg_strdup(optarg); break; + case 5: + user_opts.transfer_mode = TRANSFER_MODE_CATALOG_SWAP; + break; + default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), os_info.progname); diff --git a/src/bin/pg_upgrade/pg_upgrade.h 
b/src/bin/pg_upgrade/pg_upgrade.h index 53f693c2d4..19cb5a011e 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -256,6 +256,7 @@ typedef enum TRANSFER_MODE_COPY, TRANSFER_MODE_COPY_FILE_RANGE, TRANSFER_MODE_LINK, + TRANSFER_MODE_CATALOG_SWAP, } transferMode; /* diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 07baa49a02..9d8fce3c4a 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -11,11 +11,21 @@ #include +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/int.h" +#include "fe_utils/option_utils.h" #include "pg_upgrade.h" static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit); +typedef struct move_catalog_file_context +{ + FileNameMap *maps; + int size; + char *target; +} move_catalog_file_context; /* * transfer_all_new_tablespaces() @@ -41,6 +51,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, case TRANSFER_MODE_LINK: prep_status_progress("Linking user relation files"); break; + case TRANSFER_MODE_CATALOG_SWAP: + prep_status_progress("Swapping catalog files"); + break; } /* @@ -127,6 +140,144 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, } } +static int +FileNameMapCmp(const void *a, const void *b) +{ + return pg_cmp_u32(((const FileNameMap *) a)->relfilenumber, + ((const FileNameMap *) b)->relfilenumber); +} + +static RelFileNumber +parse_relfilenumber(const char *filename) +{ + char *endp; + unsigned long n; + + if (filename[0] < '1' || filename[0] > '9') + return InvalidRelFileNumber; + + errno = 0; + n = strtoul(filename, &endp, 10); + if (errno || filename == endp || n <= 0 || n > PG_UINT32_MAX) + return InvalidRelFileNumber; + + return (RelFileNumber) n; +} + +static int +move_catalog_file(const char *fname, bool isdir, void 
*arg) +{ + char dst[MAXPGPATH]; + const char *filename = last_dir_separator(fname) + 1; + RelFileNumber rfn = parse_relfilenumber(filename); + move_catalog_file_context *context = (move_catalog_file_context *) arg; + + /* Callback handed to walkdir() by do_catalog_transfer(); arg is a move_catalog_file_context. Always returns 0, and calls pg_fatal() if a rename fails. + * XXX: Is this right? AFAICT we don't really expect there to be + * directories within database directories, so perhaps it would be better + * to either unconditionally rename or to fail. Further investigation is + * required. For now, directories are skipped without being renamed. + */ + if (isdir) + return 0; + + if (RelFileNumberIsValid(rfn)) /* parse_relfilenumber() gave a valid number: leave the file where it is when that relfilenumber is found in context->maps */ + { + FileNameMap key; + + key.relfilenumber = (RelFileNumber) rfn; + if (bsearch(&key, context->maps, context->size, + sizeof(FileNameMap), FileNameMapCmp)) + return 0; + } + + snprintf(dst, sizeof(dst), "%s/%s", context->target, filename); /* anything else is renamed into context->target, keeping its file name */ + if (rename(fname, dst) != 0) + pg_fatal("could not rename \"%s\" to \"%s\": %m", fname, dst); + + return 0; +} + +/* Catalog-swap transfer for the database directory named by maps[0].db_oid: the old cluster directory is renamed into the new cluster location, then the files not matched in maps are exchanged between the two. maps is sorted in place by relfilenumber; old_tablespace is not used in this function. + * XXX: This proof-of-concept patch doesn't yet handle non-default tablespaces. + */ +static void +do_catalog_transfer(FileNameMap *maps, int size, char *old_tablespace) +{ + char old_tblspc[MAXPGPATH]; + char new_tblspc[MAXPGPATH]; + char old_dat[MAXPGPATH]; + char new_dat[MAXPGPATH]; + char moved_tblspc[MAXPGPATH]; + char moved_dat[MAXPGPATH]; + char old_cat[MAXPGPATH]; + move_catalog_file_context context; + DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; + + parse_sync_method(user_opts.sync_method, &sync_method); + + snprintf(old_tblspc, sizeof(old_tblspc), "%s%s", + maps[0].old_tablespace, maps[0].old_tablespace_suffix); + snprintf(new_tblspc, sizeof(new_tblspc), "%s%s", + maps[0].new_tablespace, maps[0].new_tablespace_suffix); + snprintf(old_dat, sizeof(old_dat), "%s/%u", old_tblspc, maps[0].db_oid); + snprintf(new_dat, sizeof(new_dat), "%s/%u", new_tblspc, maps[0].db_oid); + snprintf(moved_tblspc, sizeof(moved_tblspc), "%s_moved", old_tblspc); + snprintf(moved_dat, sizeof(moved_dat), "%s/%u", + moved_tblspc, maps[0].db_oid); + snprintf(old_cat, sizeof(old_cat), "%s/%u_old_cat", 
+ moved_tblspc, maps[0].db_oid); + + qsort(maps, size, sizeof(FileNameMap), FileNameMapCmp); /* order by relfilenumber, as move_catalog_file() searches maps with bsearch() */ + + /* create dir for stuff that is moved aside (an EEXIST failure is tolerated) */ + if (pg_mkdir_p(moved_tblspc, pg_dir_create_mode) && errno != EEXIST) + pg_fatal("could not create directory \"%s\": %m", moved_tblspc); + + /* move new cluster data dir aside, into the directory created above */ + if (rename(new_dat, moved_dat)) + pg_fatal("could not rename \"%s\" to \"%s\": %m", new_dat, moved_dat); + + /* move old cluster data dir in place, i.e. to the path that was just vacated */ + if (rename(old_dat, new_dat)) + pg_fatal("could not rename \"%s\" to \"%s\": %m", old_dat, new_dat); + + /* create dir for old catalogs, next to the moved-aside data dir */ + if (pg_mkdir_p(old_cat, pg_dir_create_mode)) + pg_fatal("could not create directory \"%s\": %m", old_cat); + + /* move catalogs in new data dir aside: new_dat now holds the old cluster files, and those not matched in maps go to old_cat */ + context.maps = maps; + context.size = size; + context.target = old_cat; + walkdir(new_dat, move_catalog_file, false, &context); + + /* move catalogs in moved-aside data dir in place: new cluster files not matched in maps go to new_dat */ + context.target = new_dat; + walkdir(moved_dat, move_catalog_file, false, &context); + + /* no need to sync things individually if we are going to syncfs() later */ + if (sync_method == DATA_DIR_SYNC_METHOD_SYNCFS) + return; + + /* fsync directory entries */ + if (fsync_fname(moved_dat, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", moved_dat); + if (fsync_fname(old_cat, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", old_cat); + + /* + * XXX: We could instead fsync() these directories once at the end instead + * of once per-database, but it doesn't affect performance meaningfully, + * and this is just a proof-of-concept patch, so I haven't bothered doing + * the required refactoring yet. 
+ */ + if (fsync_fname(old_tblspc, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", old_tblspc); + if (fsync_fname(moved_tblspc, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", moved_tblspc); +} + /* * transfer_single_new_db() * @@ -145,6 +296,18 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) vm_must_add_frozenbit = true; + /* + * XXX: In catalog-swap mode, vm_must_add_frozenbit isn't handled yet. We + * could either disallow using catalog-swap mode if the upgrade involves + * versions older than v9.6, or we could add code to handle rewriting the + * visibility maps in this mode (like the other modes do). + */ + if (user_opts.transfer_mode == TRANSFER_MODE_CATALOG_SWAP) + { + do_catalog_transfer(maps, size, old_tablespace); + return; + } + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || @@ -259,6 +422,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"", old_file, new_file); linkFile(old_file, new_file, map->nspname, map->relname); + break; + case TRANSFER_MODE_CATALOG_SWAP: + pg_fatal("should never happen"); /* transfer_single_new_db() returns right after do_catalog_transfer() in this mode */ + break; } } } diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 2d4c870423..f721f934c0 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3655,6 +3655,7 @@ mix_data_t mixedStruct mode_t movedb_failure_params +move_catalog_file_context multirange_bsearch_comparison multirange_unnest_fctx mxact -- 2.39.5 (Apple Git-154)