From c74d23834f370d158222e32bb1693409ff435b28 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 10 Dec 2021 20:12:26 -0800
Subject: [PATCH v4 1/6] Fix possible HOT corruption when RECENTLY_DEAD changes
 to DEAD while pruning.

Bug: #17255
Reported-By: Alexander Lakhin <exclusion@gmail.com>
Debugged-By: Andres Freund <andres@anarazel.de>
Debugged-By: Peter Geoghegan <pg@bowt.ie>
Author: Andres Freund <andres@andres@anarazel.de>
Discussion: https://postgr.es/m/20211122175914.ayk6gg6nvdwuhrzb@alap3.anarazel.de
Backpatch: 14-
---
 src/backend/access/heap/pruneheap.c | 110 +++++++++++++++++++++-------
 1 file changed, 84 insertions(+), 26 deletions(-)

diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 522a00af6d1..baf33c9b988 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -56,11 +56,31 @@ typedef struct
 	OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
 	OffsetNumber nowdead[MaxHeapTuplesPerPage];
 	OffsetNumber nowunused[MaxHeapTuplesPerPage];
-	/* marked[i] is true if item i is entered in one of the above arrays */
+
+	/*
+	 * marked[i] is true if item i is entered in one of the above arrays.
+	 *
+	 * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
+	 * 1. Otherwise every access would need to subtract 1.
+	 */
 	bool		marked[MaxHeapTuplesPerPage + 1];
+
+	/*
+	 * Tuple visibility is only computed once for each tuple both for
+	 * correctness and efficiency (see comment in heap_page_prune() for
+	 * details). This isn't of type HTSV_Result[] so we can use -1 to indicate
+	 * no visibility for a tuple has not been computed, e.g. because it * is a
+	 * LP_DEAD item.
+	 *
+	 * Same indexing as ->marked.
+	 */
+	int8		htsv[MaxHeapTuplesPerPage + 1];
 } PruneState;
 
 /* Local functions */
+static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
+											   HeapTuple tup,
+											   Buffer buffer);
 static int	heap_prune_chain(Buffer buffer,
 							 OffsetNumber rootoffnum,
 							 PruneState *prstate);
@@ -252,6 +272,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 	OffsetNumber offnum,
 				maxoff;
 	PruneState	prstate;
+	HeapTupleData tup;
 
 	/*
 	 * Our strategy is to scan the page and make lists of items to change,
@@ -274,8 +295,61 @@ heap_page_prune(Relation relation, Buffer buffer,
 	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
 	memset(prstate.marked, 0, sizeof(prstate.marked));
 
-	/* Scan the page */
 	maxoff = PageGetMaxOffsetNumber(page);
+	tup.t_tableOid = RelationGetRelid(prstate.rel);
+
+	/*
+	 * Determine HTSV for all tuples.
+	 *
+	 * This is required for correctness to deal with cases where running HTSV
+	 * twice could result in different results (e.g. RECENTLY_DEAD can turn to
+	 * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid()
+	 * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the
+	 * inserting transaction aborts, ...). That in turn could cause
+	 * heap_prune_chain() to behave incorrectly if a tuple is reached twice,
+	 * once directly via a heap_prune_chain() and once by following a HOT
+	 * chain.
+	 *
+	 * It's also good for performance. Most commonly tuples within a page are
+	 * stored at decreasing offsets (while the items are stored at increasing
+	 * offsets). When processing all tuples on a page this leads to reading
+	 * memory at decreasing offsets within a page, with a variable stride.
+	 * That's hard for CPU prefetchers to deal with. Processing the items in
+	 * reverse order (and thus the tuples in increasing order) increases
+	 * prefetching efficiency significantly / decreases the number of cache
+	 * misses.
+	 */
+	for (offnum = maxoff;
+		 offnum >= FirstOffsetNumber;
+		 offnum = OffsetNumberPrev(offnum))
+	{
+		ItemId		itemid = PageGetItemId(page, offnum);
+		HeapTupleHeader htup;
+
+		/* Nothing to do if slot doesn't contain a tuple */
+		if (!ItemIdIsNormal(itemid))
+		{
+			prstate.htsv[offnum] = -1;
+			continue;
+		}
+
+		htup = (HeapTupleHeader) PageGetItem(page, itemid);
+		tup.t_data = htup;
+		tup.t_len = ItemIdGetLength(itemid);
+		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum);
+
+		/*
+		 * Set the offset number so that we can display it along with any
+		 * error that occurred while processing this tuple.
+		 */
+		if (off_loc)
+			*off_loc = offnum;
+
+		prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup,
+														   buffer);
+	}
+
+	/* Scan the page */
 	for (offnum = FirstOffsetNumber;
 		 offnum <= maxoff;
 		 offnum = OffsetNumberNext(offnum))
@@ -286,10 +360,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 		if (prstate.marked[offnum])
 			continue;
 
-		/*
-		 * Set the offset number so that we can display it along with any
-		 * error that occurred while processing this tuple.
-		 */
+		/* see preceding loop */
 		if (off_loc)
 			*off_loc = offnum;
 
@@ -491,7 +562,7 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
  * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
  * chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
  * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
- * DEAD, the heap_prune_satisfies_vacuum test is just too coarse to detect it.
+ * DEAD, our visibility test is just too coarse to detect it.
  *
  * In general, pruning must never leave behind a DEAD tuple that still has
  * tuple storage.  VACUUM isn't prepared to deal with that case.  That's why
@@ -505,11 +576,7 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
  * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
  * tuple, which we treat as a chain of length 1.)
  *
- * prstate->vistest is used to distinguish whether tuples are DEAD or
- * RECENTLY_DEAD.
- *
- * We don't actually change the page here, except perhaps for hint-bit updates
- * caused by heap_prune_satisfies_vacuum.  We just add entries to the arrays in
+ * We don't actually change the page here. We just add entries to the arrays in
  * prstate showing the changes to be made.  Items to be redirected are added
  * to the redirected[] array (two entries per redirection); items to be set to
  * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
@@ -531,9 +598,6 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 	OffsetNumber chainitems[MaxHeapTuplesPerPage];
 	int			nchain = 0,
 				i;
-	HeapTupleData tup;
-
-	tup.t_tableOid = RelationGetRelid(prstate->rel);
 
 	rootlp = PageGetItemId(dp, rootoffnum);
 
@@ -542,12 +606,9 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 	 */
 	if (ItemIdIsNormal(rootlp))
 	{
+		Assert(prstate->htsv[rootoffnum] != -1);
 		htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
 
-		tup.t_data = htup;
-		tup.t_len = ItemIdGetLength(rootlp);
-		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum);
-
 		if (HeapTupleHeaderIsHeapOnly(htup))
 		{
 			/*
@@ -568,8 +629,8 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 			 * either here or while following a chain below.  Whichever path
 			 * gets there first will mark the tuple unused.
 			 */
-			if (heap_prune_satisfies_vacuum(prstate, &tup, buffer)
-				== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
+			if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD &&
+				!HeapTupleHeaderIsHotUpdated(htup))
 			{
 				heap_prune_record_unused(prstate, rootoffnum);
 				HeapTupleHeaderAdvanceLatestRemovedXid(htup,
@@ -636,12 +697,9 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 			break;
 
 		Assert(ItemIdIsNormal(lp));
+		Assert(prstate->htsv[offnum] != -1);
 		htup = (HeapTupleHeader) PageGetItem(dp, lp);
 
-		tup.t_data = htup;
-		tup.t_len = ItemIdGetLength(lp);
-		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum);
-
 		/*
 		 * Check the tuple XMIN against prior XMAX, if any
 		 */
@@ -659,7 +717,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 		 */
 		tupdead = recent_dead = false;
 
-		switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer))
+		switch ((HTSV_Result) prstate->htsv[offnum])
 		{
 			case HEAPTUPLE_DEAD:
 				tupdead = true;
-- 
2.34.0

