diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml
index 69d6924b3a..7f752b2ed0 100644
--- a/doc/src/sgml/ref/pg_rewind.sgml
+++ b/doc/src/sgml/ref/pg_rewind.sgml
@@ -334,15 +334,6 @@ GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text, bigint, bigint, b
-
- When executing pg_rewind using an online
- cluster as source which has been recently promoted, it is necessary
- to execute a CHECKPOINT after promotion such that its
- control file reflects up-to-date timeline information, which is used by
- pg_rewind to check if the target cluster
- can be rewound using the designated source cluster.
-
-
How It Works
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 6cb288f099..2a407da1e4 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -309,9 +309,11 @@ sync_target_dir(void)
* buffer is actually *filesize + 1. That's handy when reading a text file.
* This function can be used to read binary files as well, you can just
* ignore the zero-terminator in that case.
+ *
+ * If noerror is true, returns NULL when the file is not found.
*/
char *
-slurpFile(const char *datadir, const char *path, size_t *filesize)
+slurpFile(const char *datadir, const char *path, size_t *filesize, bool noerror)
{
int fd;
char *buffer;
@@ -323,8 +325,13 @@ slurpFile(const char *datadir, const char *path, size_t *filesize)
snprintf(fullpath, sizeof(fullpath), "%s/%s", datadir, path);
if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) == -1)
+ {
+ if (noerror && errno == ENOENT)
+ return NULL;
+
pg_fatal("could not open file \"%s\" for reading: %m",
fullpath);
+ }
if (fstat(fd, &statbuf) < 0)
pg_fatal("could not open file \"%s\" for reading: %m",
diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h
index 54a853bd42..92e19042cb 100644
--- a/src/bin/pg_rewind/file_ops.h
+++ b/src/bin/pg_rewind/file_ops.h
@@ -21,7 +21,8 @@ extern void create_target(file_entry_t *t);
extern void remove_target(file_entry_t *t);
extern void sync_target_dir(void);
-extern char *slurpFile(const char *datadir, const char *path, size_t *filesize);
+extern char *slurpFile(const char *datadir, const char *path, size_t *filesize,
+ bool noerror);
typedef void (*process_file_callback_t) (const char *path, file_type_t type, size_t size, const char *link_target);
extern void traverse_datadir(const char *datadir, process_file_callback_t callback);
diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c
index 011c9cce6e..751c96e6e4 100644
--- a/src/bin/pg_rewind/libpq_source.c
+++ b/src/bin/pg_rewind/libpq_source.c
@@ -68,7 +68,7 @@ static void libpq_queue_fetch_range(rewind_source *source, const char *path,
off_t off, size_t len);
static void libpq_finish_fetch(rewind_source *source);
static char *libpq_fetch_file(rewind_source *source, const char *path,
- size_t *filesize);
+ size_t *filesize, bool noerror);
static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source);
static void libpq_destroy(rewind_source *source);
@@ -620,9 +620,12 @@ appendArrayEscapedString(StringInfo buf, const char *str)
/*
* Fetch a single file as a malloc'd buffer.
+ *
+ * If noerror is true, returns NULL if the file does not exist on the source.
*/
static char *
-libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+ bool noerror)
{
PGconn *conn = ((libpq_source *) source)->conn;
PGresult *res;
@@ -631,6 +634,34 @@ libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
const char *paramValues[1];
paramValues[0] = path;
+
+	/*
+	 * Check the existence of the file.  We do this before executing
+	 * pg_read_binary_file() so that the server doesn't emit an error.
+	 */
+	if (noerror)
+	{
+		res = PQexecParams(conn, "SELECT pg_stat_file($1, true)",
+						   1, NULL, paramValues, NULL, NULL, 1);
+
+		if (PQresultStatus(res) != PGRES_TUPLES_OK)
+			pg_fatal("could not stat remote file \"%s\": %s",
+					 path, PQresultErrorMessage(res));
+
+		/* sanity check the result set */
+		if (PQntuples(res) != 1)
+			pg_fatal("unexpected result set while stating remote file \"%s\"",
+					 path);
+
+		/* File not found: release the result before returning NULL */
+		if (PQgetisnull(res, 0, 0))
+		{
+			PQclear(res);
+			return NULL;
+		}
+		PQclear(res);
+	}
+
res = PQexecParams(conn, "SELECT pg_read_binary_file($1)",
1, NULL, paramValues, NULL, NULL, 1);
diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c
index 2e50485c39..fc2e1e9f11 100644
--- a/src/bin/pg_rewind/local_source.c
+++ b/src/bin/pg_rewind/local_source.c
@@ -28,7 +28,7 @@ typedef struct
static void local_traverse_files(rewind_source *source,
process_file_callback_t callback);
static char *local_fetch_file(rewind_source *source, const char *path,
- size_t *filesize);
+ size_t *filesize, bool noerror);
static void local_queue_fetch_file(rewind_source *source, const char *path,
size_t len);
static void local_queue_fetch_range(rewind_source *source, const char *path,
@@ -63,9 +63,11 @@ local_traverse_files(rewind_source *source, process_file_callback_t callback)
}
static char *
-local_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+local_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+ bool noerror)
{
- return slurpFile(((local_source *) source)->datadir, path, filesize);
+ return slurpFile(((local_source *) source)->datadir, path, filesize,
+ noerror);
}
/*
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 1ff8da1676..221e97c5d8 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -43,6 +43,8 @@ static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
static void digestControlFile(ControlFileData *ControlFile,
const char *content, size_t size);
+static TimeLineHistoryEntry *getTimelineHistory(ControlFileData *controlFile,
+ int *nentries);
static void getRestoreCommand(const char *argv0);
static void sanityChecks(void);
static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
@@ -70,9 +72,13 @@ bool do_sync = true;
bool restore_wal = false;
/* Target history */
-TimeLineHistoryEntry *targetHistory;
+TimeLineHistoryEntry *targetHistory = NULL;
int targetNentries;
+/* Source history */
+TimeLineHistoryEntry *sourceHistory = NULL;
+int sourceNentries;
+
/* Progress counters */
uint64 fetch_size;
uint64 fetch_done;
@@ -141,6 +147,7 @@ main(int argc, char **argv)
bool rewind_needed;
bool writerecoveryconf = false;
filemap_t *filemap;
+ TimeLineID source_tli;
pg_logging_init(argv[0]);
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
@@ -311,7 +318,7 @@ main(int argc, char **argv)
* need to make sure by themselves that the target cluster is in a clean
* state.
*/
- buffer = slurpFile(datadir_target, "global/pg_control", &size);
+ buffer = slurpFile(datadir_target, "global/pg_control", &size, false);
digestControlFile(&ControlFile_target, buffer, size);
pg_free(buffer);
@@ -321,25 +328,43 @@ main(int argc, char **argv)
{
ensureCleanShutdown(argv[0]);
- buffer = slurpFile(datadir_target, "global/pg_control", &size);
+ buffer = slurpFile(datadir_target, "global/pg_control", &size, false);
digestControlFile(&ControlFile_target, buffer, size);
pg_free(buffer);
}
- buffer = source->fetch_file(source, "global/pg_control", &size);
+ buffer = source->fetch_file(source, "global/pg_control", &size, false);
digestControlFile(&ControlFile_source, buffer, size);
pg_free(buffer);
sanityChecks();
+ /* Retrieve timelines for both source and target */
+ sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
+ targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
+
+	/*
+	 * If the source has just been promoted but the end-of-recovery checkpoint
+	 * has not completed yet, the source control file still carries an outdated
+	 * timeline ID.  Instead, look into the timeline history, which is always
+	 * kept up to date.
+	 */
+	source_tli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+
+	if (sourceHistory[sourceNentries - 1].tli > source_tli)
+	{
+		pg_log_info("source's actual timeline ID (%u) is newer than control file (%u)",
+					sourceHistory[sourceNentries - 1].tli, source_tli);
+		source_tli = sourceHistory[sourceNentries - 1].tli;
+	}
+
/*
* Find the common ancestor timeline between the clusters.
*
* If both clusters are already on the same timeline, there's nothing to
* do.
*/
- if (ControlFile_target.checkPointCopy.ThisTimeLineID ==
- ControlFile_source.checkPointCopy.ThisTimeLineID)
+ if (ControlFile_target.checkPointCopy.ThisTimeLineID == source_tli)
{
pg_log_info("source and target cluster are on the same timeline");
rewind_needed = false;
@@ -581,7 +606,7 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
* Fetch the control file from the source last. This ensures that the
* minRecoveryPoint is up-to-date.
*/
- buffer = source->fetch_file(source, "global/pg_control", &size);
+ buffer = source->fetch_file(source, "global/pg_control", &size, false);
digestControlFile(&ControlFile_source_after, buffer, size);
pg_free(buffer);
@@ -654,7 +679,22 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
pg_fatal("source system was in unexpected state at end of rewind");
endrec = source->get_current_wal_insert_lsn(source);
- endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+
+ /*
+ * Find the timeline ID corresponding to endrec on the source.
+ *
+ * In most cases we can use the TLI in the source control file, but
+ * that is not the case after promotion until end-of-recovery
+ * checkpoint completes, where the control file is a bit old for
+ * this purpose. It should be the latest timeline in the source's
+ * history file.
+ */
+ if (!((sourceHistory[sourceNentries - 1].begin == 0 ||
+ sourceHistory[sourceNentries - 1].begin <= endrec) &&
+ sourceHistory[sourceNentries - 1].end == 0))
+ pg_fatal("source server's current insert LSN was not on the latest timeline in history file");
+
+ endtli = sourceHistory[sourceNentries - 1].tli;
}
}
else
@@ -796,46 +836,83 @@ MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
}
/*
- * Retrieve timeline history for given control file which should behold
- * either source or target.
+ * Retrieve the latest timeline history for the given control file, which
+ * must be that of either the source or the target.
+ *
+ * This works on the assumption that the timeline IDs of existing history files
+ * are contiguous up to the latest history file. This is true for
+ * recently-promoted servers. See findNewestTimeLine() for this assumption.
*/
static TimeLineHistoryEntry *
getTimelineHistory(ControlFileData *controlFile, int *nentries)
{
- TimeLineHistoryEntry *history;
+ TimeLineHistoryEntry *history = NULL;
TimeLineID tli;
+ TimeLineID probe_tli;
tli = controlFile->checkPointCopy.ThisTimeLineID;
+ Assert(tli > 0);
+
+ /* Probe history files */
+ for (probe_tli = tli ;; probe_tli++)
+ {
+ char path[MAXPGPATH];
+ char *histfile;
+ TimeLineHistoryEntry *tmphistory;
+ int nent;
+ int i;
+
+ if (probe_tli < 2)
+ continue;
+
+ TLHistoryFilePath(path, probe_tli);
+
+ /* Get history file from appropriate source */
+ if (controlFile == &ControlFile_source)
+ histfile = source->fetch_file(source, path, NULL, true);
+ else if (controlFile == &ControlFile_target)
+ histfile = slurpFile(datadir_target, path, NULL, true);
+ else
+ pg_fatal("invalid control file");
+
+ /* no such history file, exit */
+ if (!histfile)
+ break;
+
+ /* preserve the history if we're part of it */
+ tmphistory = rewind_parseTimeLineHistory(histfile, probe_tli, &nent);
+ pg_free(histfile);
+
+ for (i = 0 ; i < nent ; i++)
+ {
+ if (tmphistory[i].tli == tli)
+ {
+ if (history)
+ pg_free(history);
+
+ history = tmphistory;
+ *nentries = nent;
+ break;
+ }
+ }
+ if (tmphistory != history)
+ pg_free(tmphistory);
+ }
+
/*
* Timeline 1 does not have a history file, so there is no need to check
* and fake an entry with infinite start and end positions.
*/
- if (tli == 1)
+ if (!history)
{
+		if (tli != 1)		/* only timeline 1 may lack a history file */
+			pg_fatal("could not find history file for timeline %u", tli);
history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
history->tli = tli;
history->begin = history->end = InvalidXLogRecPtr;
*nentries = 1;
}
- else
- {
- char path[MAXPGPATH];
- char *histfile;
-
- TLHistoryFilePath(path, tli);
-
- /* Get history file from appropriate source */
- if (controlFile == &ControlFile_source)
- histfile = source->fetch_file(source, path, NULL);
- else if (controlFile == &ControlFile_target)
- histfile = slurpFile(datadir_target, path, NULL);
- else
- pg_fatal("invalid control file");
-
- history = rewind_parseTimeLineHistory(histfile, tli, nentries);
- pg_free(histfile);
- }
if (debug)
{
@@ -879,15 +956,9 @@ getTimelineHistory(ControlFileData *controlFile, int *nentries)
static void
findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
{
- TimeLineHistoryEntry *sourceHistory;
- int sourceNentries;
int i,
n;
- /* Retrieve timelines for both source and target */
- sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
- targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
-
/*
* Trace the history forward, until we hit the timeline diverge. It may
* still be possible that the source and target nodes used the same
@@ -910,7 +981,6 @@ findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
*recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
*tliIndex = i;
- pg_free(sourceHistory);
return;
}
else
diff --git a/src/bin/pg_rewind/rewind_source.h b/src/bin/pg_rewind/rewind_source.h
index 1310e86e75..6975848668 100644
--- a/src/bin/pg_rewind/rewind_source.h
+++ b/src/bin/pg_rewind/rewind_source.h
@@ -35,7 +35,7 @@ typedef struct rewind_source
* handy for text files.
*/
char *(*fetch_file) (struct rewind_source *, const char *path,
- size_t *filesize);
+ size_t *filesize, bool noerror);
/*
* Request to fetch (part of) a file in the source system, specified by an