From 1ba3c5eeb250d3e755ad5c044aede1816d040ae4 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 27 Oct 2020 20:39:57 -0700
Subject: [PATCH v2] Centralize horizon determination for temp tables, fix bug
 due to skew.

Also add tests to verify that temp tables can be pruned despite other
sessions having older snapshots.

Author: Andres Freund
Discussion: https://postgr.es/m/20201014203103.72oke6hqywcyhx7s@alap3.anarazel.de
Discussion: https://postgr.es/m/20201015083735.derdzysdtqdvxshp@alap3.anarazel.de
---
 src/include/commands/cluster.h           |   3 +-
 src/include/commands/vacuum.h            |   1 -
 src/backend/access/heap/vacuumlazy.c     |   1 -
 src/backend/commands/cluster.c           |  28 +--
 src/backend/commands/vacuum.c            |  76 +++---
 src/backend/storage/ipc/procarray.c      |  59 ++++-
 src/test/isolation/expected/horizons.out | 281 +++++++++++++++++++++++
 src/test/isolation/isolation_schedule    |   1 +
 src/test/isolation/specs/horizons.spec   | 165 +++++++++++++
 9 files changed, 541 insertions(+), 74 deletions(-)
 create mode 100644 src/test/isolation/expected/horizons.out
 create mode 100644 src/test/isolation/specs/horizons.spec

diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h
index 1eb144204b6..e05884781b9 100644
--- a/src/include/commands/cluster.h
+++ b/src/include/commands/cluster.h
@@ -19,8 +19,7 @@
 
 
 extern void cluster(ClusterStmt *stmt, bool isTopLevel);
-extern void cluster_rel(Oid tableOid, Oid indexOid, int options,
-						bool isTopLevel);
+extern void cluster_rel(Oid tableOid, Oid indexOid, int options);
 extern void check_index_is_clusterable(Relation OldHeap, Oid indexOid,
 									   bool recheck, LOCKMODE lockmode);
 extern void mark_index_clustered(Relation rel, Oid indexOid, bool is_internal);
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index d9475c99890..a4cd7214009 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -267,7 +267,6 @@ extern void vacuum_set_xid_limits(Relation rel,
 								  int freeze_min_age, int freeze_table_age,
 								  int multixact_freeze_min_age,
 								  int multixact_freeze_table_age,
-								  bool isTopLevel,
 								  TransactionId *oldestXmin,
 								  TransactionId *freezeLimit,
 								  TransactionId *xidFullScanLimit,
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 4f2f38168dc..be5439dd9d8 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -471,7 +471,6 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
 						  params->freeze_table_age,
 						  params->multixact_freeze_min_age,
 						  params->multixact_freeze_table_age,
-						  true, /* we must be a top-level command */
 						  &OldestXmin, &FreezeLimit, &xidFullScanLimit,
 						  &MultiXactCutoff, &mxactFullScanLimit);
 
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 0d647e912c4..04d12a7ece6 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -67,13 +67,10 @@ typedef struct
 } RelToCluster;
 
 
-static void rebuild_relation(Relation OldHeap, Oid indexOid,
-							 bool isTopLevel, bool verbose);
+static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
 static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
-							bool isTopLevel, bool verbose,
-							bool *pSwapToastByContent,
-							TransactionId *pFreezeXid,
-							MultiXactId *pCutoffMulti);
+							bool verbose, bool *pSwapToastByContent,
+							TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
 static List *get_tables_to_cluster(MemoryContext cluster_context);
 
 
@@ -173,7 +170,7 @@ cluster(ClusterStmt *stmt, bool isTopLevel)
 		table_close(rel, NoLock);
 
 		/* Do the job. */
-		cluster_rel(tableOid, indexOid, stmt->options, isTopLevel);
+		cluster_rel(tableOid, indexOid, stmt->options);
 	}
 	else
 	{
@@ -222,8 +219,7 @@ cluster(ClusterStmt *stmt, bool isTopLevel)
 			PushActiveSnapshot(GetTransactionSnapshot());
 			/* Do the job. */
 			cluster_rel(rvtc->tableOid, rvtc->indexOid,
-						stmt->options | CLUOPT_RECHECK,
-						isTopLevel);
+						stmt->options | CLUOPT_RECHECK);
 			PopActiveSnapshot();
 			CommitTransactionCommand();
 		}
@@ -254,7 +250,7 @@ cluster(ClusterStmt *stmt, bool isTopLevel)
  * and error messages should refer to the operation as VACUUM not CLUSTER.
  */
 void
-cluster_rel(Oid tableOid, Oid indexOid, int options, bool isTopLevel)
+cluster_rel(Oid tableOid, Oid indexOid, int options)
 {
 	Relation	OldHeap;
 	bool		verbose = ((options & CLUOPT_VERBOSE) != 0);
@@ -404,7 +400,7 @@ cluster_rel(Oid tableOid, Oid indexOid, int options, bool isTopLevel)
 	TransferPredicateLocksToHeapRelation(OldHeap);
 
 	/* rebuild_relation does all the dirty work */
-	rebuild_relation(OldHeap, indexOid, isTopLevel, verbose);
+	rebuild_relation(OldHeap, indexOid, verbose);
 
 	/* NB: rebuild_relation does table_close() on OldHeap */
 
@@ -549,12 +545,11 @@ mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
  *
  * OldHeap: table to rebuild --- must be opened and exclusive-locked!
  * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
- * isTopLevel: should be passed down from ProcessUtility.
  *
  * NB: this routine closes OldHeap at the right time; caller should not.
  */
 static void
-rebuild_relation(Relation OldHeap, Oid indexOid, bool isTopLevel, bool verbose)
+rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
 {
 	Oid			tableOid = RelationGetRelid(OldHeap);
 	Oid			tableSpace = OldHeap->rd_rel->reltablespace;
@@ -582,7 +577,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool isTopLevel, bool verbose)
 							   AccessExclusiveLock);
 
 	/* Copy the heap data into the new table in the desired order */
-	copy_table_data(OIDNewHeap, tableOid, indexOid, isTopLevel, verbose,
+	copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
 					&swap_toast_by_content, &frozenXid, &cutoffMulti);
 
 	/*
@@ -733,8 +728,7 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
  * *pCutoffMulti receives the MultiXactId used as a cutoff point.
  */
 static void
-copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
-				bool isTopLevel, bool verbose,
+copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 				bool *pSwapToastByContent, TransactionId *pFreezeXid,
 				MultiXactId *pCutoffMulti)
 {
@@ -832,7 +826,7 @@ copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 	 * Since we're going to rewrite the whole table anyway, there's no reason
 	 * not to be aggressive about this.
 	 */
-	vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0, isTopLevel,
+	vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
 						  &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
 						  NULL);
 
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index ddeec870d81..1b6717f727e 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -907,8 +907,7 @@ get_all_vacuum_rels(int options)
 /*
  * vacuum_set_xid_limits() -- compute oldestXmin and freeze cutoff points
  *
- * Input parameters are the target relation, applicable freeze age settings,
- * and isTopLevel which should be passed down from ProcessUtility.
+ * Input parameters are the target relation, applicable freeze age settings.
  *
  * The output parameters are:
  * - oldestXmin is the cutoff value used to distinguish whether tuples are
@@ -934,7 +933,6 @@ vacuum_set_xid_limits(Relation rel,
 					  int freeze_table_age,
 					  int multixact_freeze_min_age,
 					  int multixact_freeze_table_age,
-					  bool isTopLevel,
 					  TransactionId *oldestXmin,
 					  TransactionId *freezeLimit,
 					  TransactionId *xidFullScanLimit,
@@ -950,53 +948,33 @@ vacuum_set_xid_limits(Relation rel,
 	MultiXactId mxactLimit;
 	MultiXactId safeMxactLimit;
 
-	if (RELATION_IS_LOCAL(rel) && !IsInTransactionBlock(isTopLevel))
-	{
-		/*
-		 * If we are processing a temp relation (which by prior checks must be
-		 * one belonging to our session), and we are not inside any
-		 * transaction block, then there can be no tuples in the rel that are
-		 * still in-doubt, nor can there be any that are dead but possibly
-		 * still interesting to some snapshot our session holds.  We don't
-		 * need to care whether other sessions could see such tuples, either.
-		 * So we can aggressively set the cutoff xmin to be the nextXid.
-		 */
-		*oldestXmin = ReadNewTransactionId();
-	}
-	else
-	{
-		/*
-		 * Otherwise, calculate the cutoff xmin normally.
-		 *
-		 * We can always ignore processes running lazy vacuum.  This is
-		 * because we use these values only for deciding which tuples we must
-		 * keep in the tables.  Since lazy vacuum doesn't write its XID
-		 * anywhere (usually no XID assigned), it's safe to ignore it.  In
-		 * theory it could be problematic to ignore lazy vacuums in a full
-		 * vacuum, but keep in mind that only one vacuum process can be
-		 * working on a particular table at any time, and that each vacuum is
-		 * always an independent transaction.
-		 */
-		*oldestXmin = GetOldestNonRemovableTransactionId(rel);
+	/*
+	 * We can always ignore processes running lazy vacuum.  This is because we
+	 * use these values only for deciding which tuples we must keep in the
+	 * tables.  Since lazy vacuum doesn't write its XID anywhere (usually no
+	 * XID assigned), it's safe to ignore it.  In theory it could be
+	 * problematic to ignore lazy vacuums in a full vacuum, but keep in mind
+	 * that only one vacuum process can be working on a particular table at
+	 * any time, and that each vacuum is always an independent transaction.
+	 */
+	*oldestXmin = GetOldestNonRemovableTransactionId(rel);
 
-		if (OldSnapshotThresholdActive())
+	if (OldSnapshotThresholdActive())
+	{
+		TransactionId limit_xmin;
+		TimestampTz limit_ts;
+
+		if (TransactionIdLimitedForOldSnapshots(*oldestXmin, rel,
+												&limit_xmin, &limit_ts))
 		{
-			TransactionId limit_xmin;
-			TimestampTz limit_ts;
-
-			if (TransactionIdLimitedForOldSnapshots(*oldestXmin, rel,
-													&limit_xmin, &limit_ts))
-			{
-				/*
-				 * TODO: We should only set the threshold if we are pruning on
-				 * the basis of the increased limits.  Not as crucial here as
-				 * it is for opportunistic pruning (which often happens at a
-				 * much higher frequency), but would still be a significant
-				 * improvement.
-				 */
-				SetOldSnapshotThresholdTimestamp(limit_ts, limit_xmin);
-				*oldestXmin = limit_xmin;
-			}
+			/*
+			 * TODO: We should only set the threshold if we are pruning on the
+			 * basis of the increased limits.  Not as crucial here as it is
+			 * for opportunistic pruning (which often happens at a much higher
+			 * frequency), but would still be a significant improvement.
+			 */
+			SetOldSnapshotThresholdTimestamp(limit_ts, limit_xmin);
+			*oldestXmin = limit_xmin;
 		}
 	}
 
@@ -1930,7 +1908,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params)
 			cluster_options |= CLUOPT_VERBOSE;
 
 		/* VACUUM FULL is now a variant of CLUSTER; see cluster.c */
-		cluster_rel(relid, InvalidOid, cluster_options, true);
+		cluster_rel(relid, InvalidOid, cluster_options);
 	}
 	else
 		table_relation_vacuum(onerel, params, vac_strategy);
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 07c5eeb7495..ee4caa51152 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -131,7 +131,7 @@ typedef struct ProcArrayStruct
  * different types of relations. As e.g. a normal user defined table in one
  * database is inaccessible to backends connected to another database, a test
  * specific to a relation can be more aggressive than a test for a shared
- * relation.  Currently we track three different states:
+ * relation.  Currently we track four different states:
  *
  * 1) GlobalVisSharedRels, which only considers an XID's
  *    effects visible-to-everyone if neither snapshots in any database, nor a
@@ -153,6 +153,9 @@ typedef struct ProcArrayStruct
  *    I.e. the difference to GlobalVisCatalogRels is that
  *    replication slot's catalog_xmin is not taken into account.
  *
+ * 4) GlobalVisTempRels, which only considers the current session, as temp
+ *    tables are not visible to other sessions.
+ *
  * GlobalVisTestFor(relation) returns the appropriate state
  * for the relation.
  *
@@ -234,6 +237,13 @@ typedef struct ComputeXidHorizonsResult
 	 * defined tables.
 	 */
 	TransactionId data_oldest_nonremovable;
+
+	/*
+	 * Oldest xid for which deleted tuples need to be retained in this
+	 * session's temporary tables.
+	 */
+	TransactionId temp_oldest_nonremovable;
+
 } ComputeXidHorizonsResult;
 
 
@@ -257,12 +267,13 @@ static TransactionId standbySnapshotPendingXmin;
 
 /*
  * State for visibility checks on different types of relations. See struct
- * GlobalVisState for details. As shared, catalog, and user defined
+ * GlobalVisState for details. As shared, catalog, normal and temporary
  * relations can have different horizons, one such state exists for each.
  */
 static GlobalVisState GlobalVisSharedRels;
 static GlobalVisState GlobalVisCatalogRels;
 static GlobalVisState GlobalVisDataRels;
+static GlobalVisState GlobalVisTempRels;
 
 /*
  * This backend's RecentXmin at the last time the accurate xmin horizon was
@@ -1668,6 +1679,23 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 		h->oldest_considered_running = initial;
 		h->shared_oldest_nonremovable = initial;
 		h->data_oldest_nonremovable = initial;
+
+		/*
+		 * Only modifications made by this backend affect the horizon for
+		 * temporary relations. Instead of a check in each iteration of the
+		 * loop over all PGPROCs it is cheaper to just initialize to the
+		 * current top-level xid any.
+		 *
+		 * Without an assigned xid we could use a horizon as agressive as
+		 * ReadNewTransactionid(), but we can get away with the much cheaper
+		 * latestCompletedXid + 1: If this backend has no xid there, by
+		 * definition, can't be any newer changes in the temp table than
+		 * latestCompletedXid.
+		 */
+		if (TransactionIdIsValid(MyProc->xid))
+			h->temp_oldest_nonremovable = MyProc->xid;
+		else
+			h->temp_oldest_nonremovable = initial;
 	}
 
 	/*
@@ -1760,6 +1788,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 			TransactionIdOlder(h->shared_oldest_nonremovable, kaxmin);
 		h->data_oldest_nonremovable =
 			TransactionIdOlder(h->data_oldest_nonremovable, kaxmin);
+		/* temp relations cannot be accessed in recovery */
 	}
 	else
 	{
@@ -1785,6 +1814,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 		h->data_oldest_nonremovable =
 			TransactionIdRetreatedBy(h->data_oldest_nonremovable,
 									 vacuum_defer_cleanup_age);
+		/* defer doesn't apply to temp relations */
 	}
 
 	/*
@@ -1844,6 +1874,8 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 										 h->catalog_oldest_nonremovable));
 	Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
 										 h->data_oldest_nonremovable));
+	Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+										 h->temp_oldest_nonremovable));
 	Assert(!TransactionIdIsValid(h->slot_xmin) ||
 		   TransactionIdPrecedesOrEquals(h->oldest_considered_running,
 										 h->slot_xmin));
@@ -1878,6 +1910,8 @@ GetOldestNonRemovableTransactionId(Relation rel)
 		return horizons.shared_oldest_nonremovable;
 	else if (RelationIsAccessibleInLogicalDecoding(rel))
 		return horizons.catalog_oldest_nonremovable;
+	else if (RELATION_IS_LOCAL(rel))
+		return horizons.temp_oldest_nonremovable;
 	else
 		return horizons.data_oldest_nonremovable;
 }
@@ -2054,8 +2088,8 @@ GetSnapshotDataReuse(Snapshot snapshot)
  *		RecentXmin: the xmin computed for the most recent snapshot.  XIDs
  *			older than this are known not running any more.
  *
- * And try to advance the bounds of GlobalVisSharedRels, GlobalVisCatalogRels,
- * GlobalVisDataRels for the benefit of theGlobalVisTest* family of functions.
+ * And try to advance the bounds of GlobalVis{Shared,Catalog,Data,Temp}Rels
+ * for the benefit of theGlobalVisTest* family of functions.
  *
  * Note: this function should probably not be called with an argument that's
  * not statically allocated (see xip allocation below).
@@ -2357,6 +2391,15 @@ GetSnapshotData(Snapshot snapshot)
 		GlobalVisDataRels.definitely_needed =
 			FullTransactionIdNewer(def_vis_fxid_data,
 								   GlobalVisDataRels.definitely_needed);
+		/* See temp_oldest_nonremovable computation in ComputeXidHorizons() */
+		if (TransactionIdIsNormal(myxid))
+			GlobalVisTempRels.definitely_needed =
+				FullXidRelativeTo(latest_completed, myxid);
+		else
+		{
+			GlobalVisTempRels.definitely_needed = latest_completed;
+			FullTransactionIdAdvance(&GlobalVisTempRels.definitely_needed);
+		}
 
 		/*
 		 * Check if we know that we can initialize or increase the lower
@@ -2375,6 +2418,8 @@ GetSnapshotData(Snapshot snapshot)
 		GlobalVisDataRels.maybe_needed =
 			FullTransactionIdNewer(GlobalVisDataRels.maybe_needed,
 								   oldestfxid);
+		/* accurate value known */
+		GlobalVisTempRels.maybe_needed = GlobalVisTempRels.definitely_needed;
 	}
 
 	RecentXmin = xmin;
@@ -3892,6 +3937,8 @@ GlobalVisTestFor(Relation rel)
 		state = &GlobalVisSharedRels;
 	else if (need_catalog)
 		state = &GlobalVisCatalogRels;
+	else if (RELATION_IS_LOCAL(rel))
+		state = &GlobalVisTempRels;
 	else
 		state = &GlobalVisDataRels;
 
@@ -3942,6 +3989,9 @@ GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons)
 	GlobalVisDataRels.maybe_needed =
 		FullXidRelativeTo(horizons->latest_completed,
 						  horizons->data_oldest_nonremovable);
+	GlobalVisTempRels.maybe_needed =
+		FullXidRelativeTo(horizons->latest_completed,
+						  horizons->temp_oldest_nonremovable);
 
 	/*
 	 * In longer running transactions it's possible that transactions we
@@ -3957,6 +4007,7 @@ GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons)
 	GlobalVisDataRels.definitely_needed =
 		FullTransactionIdNewer(GlobalVisDataRels.maybe_needed,
 							   GlobalVisDataRels.definitely_needed);
+	GlobalVisTempRels.definitely_needed = GlobalVisTempRels.maybe_needed;
 
 	ComputeXidHorizonsResultLastXmin = RecentXmin;
 }
diff --git a/src/test/isolation/expected/horizons.out b/src/test/isolation/expected/horizons.out
new file mode 100644
index 00000000000..07bbc9832cd
--- /dev/null
+++ b/src/test/isolation/expected/horizons.out
@@ -0,0 +1,281 @@
+Parsed test spec with 2 sessions
+
+starting permutation: pruner_create_perm ll_start pruner_query_plan pruner_query pruner_query pruner_delete pruner_query pruner_query ll_commit pruner_drop
+step pruner_create_perm: 
+    CREATE TABLE horizons_tst (data int unique) WITH (autovacuum_enabled = off);
+    INSERT INTO horizons_tst(data) VALUES(1),(2);
+
+step ll_start: 
+    BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+    SELECT 1;
+
+?column?       
+
+1              
+step pruner_query_plan: 
+    EXPLAIN (COSTS OFF) SELECT * FROM horizons_tst ORDER BY data;
+
+QUERY PLAN     
+
+Index Only Scan using horizons_tst_data_key on horizons_tst
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_delete: 
+    DELETE FROM horizons_tst;
+
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step ll_commit: COMMIT;
+step pruner_drop: 
+    DROP TABLE horizons_tst;
+
+
+starting permutation: pruner_create_temp ll_start pruner_query_plan pruner_query pruner_query pruner_delete pruner_query pruner_query ll_commit pruner_drop
+step pruner_create_temp: 
+    CREATE TEMPORARY TABLE horizons_tst (data int unique) WITH (autovacuum_enabled = off);
+    INSERT INTO horizons_tst(data) VALUES(1),(2);
+
+step ll_start: 
+    BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+    SELECT 1;
+
+?column?       
+
+1              
+step pruner_query_plan: 
+    EXPLAIN (COSTS OFF) SELECT * FROM horizons_tst ORDER BY data;
+
+QUERY PLAN     
+
+Index Only Scan using horizons_tst_data_key on horizons_tst
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_delete: 
+    DELETE FROM horizons_tst;
+
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+0              
+step ll_commit: COMMIT;
+step pruner_drop: 
+    DROP TABLE horizons_tst;
+
+
+starting permutation: pruner_create_temp ll_start pruner_query pruner_query pruner_begin pruner_delete pruner_query pruner_query ll_commit pruner_commit pruner_drop
+step pruner_create_temp: 
+    CREATE TEMPORARY TABLE horizons_tst (data int unique) WITH (autovacuum_enabled = off);
+    INSERT INTO horizons_tst(data) VALUES(1),(2);
+
+step ll_start: 
+    BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+    SELECT 1;
+
+?column?       
+
+1              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_begin: BEGIN;
+step pruner_delete: 
+    DELETE FROM horizons_tst;
+
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step ll_commit: COMMIT;
+step pruner_commit: COMMIT;
+step pruner_drop: 
+    DROP TABLE horizons_tst;
+
+
+starting permutation: pruner_create_perm ll_start pruner_query pruner_query pruner_delete pruner_vacuum pruner_query pruner_query ll_commit pruner_drop
+step pruner_create_perm: 
+    CREATE TABLE horizons_tst (data int unique) WITH (autovacuum_enabled = off);
+    INSERT INTO horizons_tst(data) VALUES(1),(2);
+
+step ll_start: 
+    BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+    SELECT 1;
+
+?column?       
+
+1              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_delete: 
+    DELETE FROM horizons_tst;
+
+step pruner_vacuum: 
+    VACUUM horizons_tst;
+
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step ll_commit: COMMIT;
+step pruner_drop: 
+    DROP TABLE horizons_tst;
+
+
+starting permutation: pruner_create_temp ll_start pruner_query pruner_query pruner_delete pruner_vacuum pruner_query pruner_query ll_commit pruner_drop
+step pruner_create_temp: 
+    CREATE TEMPORARY TABLE horizons_tst (data int unique) WITH (autovacuum_enabled = off);
+    INSERT INTO horizons_tst(data) VALUES(1),(2);
+
+step ll_start: 
+    BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+    SELECT 1;
+
+?column?       
+
+1              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+2              
+step pruner_delete: 
+    DELETE FROM horizons_tst;
+
+step pruner_vacuum: 
+    VACUUM horizons_tst;
+
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+0              
+step pruner_query: 
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+
+?column?       
+
+0              
+step ll_commit: COMMIT;
+step pruner_drop: 
+    DROP TABLE horizons_tst;
+
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index aa386ab1a25..f2e752c4454 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -78,6 +78,7 @@ test: timeouts
 test: vacuum-concurrent-drop
 test: vacuum-conflict
 test: vacuum-skip-locked
+test: horizons
 test: predicate-hash
 test: predicate-gist
 test: predicate-gin
diff --git a/src/test/isolation/specs/horizons.spec b/src/test/isolation/specs/horizons.spec
new file mode 100644
index 00000000000..19cc0a086ae
--- /dev/null
+++ b/src/test/isolation/specs/horizons.spec
@@ -0,0 +1,165 @@
+# Test that pruning and vacuuming pay attention to concurrent sessions
+# in the right way. For normal relations that means that rows cannot
+# be pruned away if there's an older snapshot, in contrast to that
+# temporary tables should nearly always be prunable.
+
+setup {
+    CREATE OR REPLACE FUNCTION explain_json(p_query text)
+    RETURNS json
+    LANGUAGE plpgsql AS $$
+        DECLARE
+            v_ret json;
+        BEGIN
+            EXECUTE p_query INTO STRICT v_ret;
+            RETURN v_ret;
+        END;$$;
+}
+
+teardown {
+    DROP FUNCTION explain_json(text);
+}
+
+session "lifeline"
+
+# Start a transaction, force a snapshot to be held
+step "ll_start"
+{
+    BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+    SELECT 1;
+}
+
+step "ll_commit" { COMMIT; }
+
+
+session "pruner"
+
+setup
+{
+    SET enable_seqscan = false;
+    SET enable_indexscan = false;
+    SET enable_bitmapscan = false;
+}
+
+step "pruner_create_temp"
+{
+    CREATE TEMPORARY TABLE horizons_tst (data int unique) WITH (autovacuum_enabled = off);
+    INSERT INTO horizons_tst(data) VALUES(1),(2);
+}
+
+step "pruner_create_perm"
+{
+    CREATE TABLE horizons_tst (data int unique) WITH (autovacuum_enabled = off);
+    INSERT INTO horizons_tst(data) VALUES(1),(2);
+}
+
+# Temp tables cannot be dropped in the teardown, so just always do so
+# as part of the permutation
+step "pruner_drop"
+{
+    DROP TABLE horizons_tst;
+}
+
+step "pruner_delete"
+{
+    DELETE FROM horizons_tst;
+}
+
+step "pruner_begin" { BEGIN; }
+step "pruner_commit" { COMMIT; }
+
+step "pruner_vacuum"
+{
+    VACUUM horizons_tst;
+}
+
+# Show the heap fetches of an ordered index-only-scan (other plans
+# have been forbidden above) - that tells us how many non-killed leaf
+# entries there are.
+step "pruner_query"
+{
+    SELECT explain_json($$
+        EXPLAIN (FORMAT json, BUFFERS, ANALYZE)
+	  SELECT * FROM horizons_tst ORDER BY data;$$)->0->'Plan'->'Heap Fetches';
+}
+
+# Verify that the query plan still is an IOS
+step "pruner_query_plan"
+{
+    EXPLAIN (COSTS OFF) SELECT * FROM horizons_tst ORDER BY data;
+}
+
+
+# Show that with a permanent relation deleted rows cannot be pruned
+# away if there's a concurrent session still seeing the rows.
+permutation
+    "pruner_create_perm"
+    "ll_start"
+    "pruner_query_plan"
+    # Run query that could do pruning twice, first has chance to prune,
+    # second would not perform heap fetches if first query did.
+    "pruner_query"
+    "pruner_query"
+    "pruner_delete"
+    "pruner_query"
+    "pruner_query"
+    "ll_commit"
+    "pruner_drop"
+
+# Show that with a temporary relation deleted rows can be pruned away,
+# even if there's a concurrent session with a snapshot from before the
+# deletion. That's safe because the session with the older snapshot
+# cannot access the temporary table.
+permutation
+    "pruner_create_temp"
+    "ll_start"
+    "pruner_query_plan"
+    "pruner_query"
+    "pruner_query"
+    "pruner_delete"
+    "pruner_query"
+    "pruner_query"
+    "ll_commit"
+    "pruner_drop"
+
+# Verify that pruning in temporary relations doesn't remove rows still
+# visible in the current session
+permutation
+    "pruner_create_temp"
+    "ll_start"
+    "pruner_query"
+    "pruner_query"
+    "pruner_begin"
+    "pruner_delete"
+    "pruner_query"
+    "pruner_query"
+    "ll_commit"
+    "pruner_commit"
+    "pruner_drop"
+
+# Show that vacuum cannot remove deleted rows still visible to another
+# session's snapshot, when accessing a permanent table.
+permutation
+    "pruner_create_perm"
+    "ll_start"
+    "pruner_query"
+    "pruner_query"
+    "pruner_delete"
+    "pruner_vacuum"
+    "pruner_query"
+    "pruner_query"
+    "ll_commit"
+    "pruner_drop"
+
+# Show that vacuum can remove deleted rows still visible to another
+# session's snapshot, when accessing a temporary table.
+permutation
+    "pruner_create_temp"
+    "ll_start"
+    "pruner_query"
+    "pruner_query"
+    "pruner_delete"
+    "pruner_vacuum"
+    "pruner_query"
+    "pruner_query"
+    "ll_commit"
+    "pruner_drop"
-- 
2.28.0.651.g306ee63a70

