From b7642a47f05e845ea1757ea75c01979d33ef572a Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 22 Nov 2021 09:51:38 -0800
Subject: [PATCH v3 2/3] Fix possible HOT corruption when RECENTLY_DEAD changes
 to DEAD while pruning.

Author:
Reviewed-By:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/access/heap/pruneheap.c           | 71 +++++++++++++++----
 .../expected/prune-recently-dead.out          | 37 +++++-----
 2 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 5c0b60319d8..d304fd3dcdb 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -58,12 +58,18 @@ typedef struct
 	OffsetNumber nowunused[MaxHeapTuplesPerPage];
 	/* marked[i] is true if item i is entered in one of the above arrays */
 	bool		marked[MaxHeapTuplesPerPage + 1];
+
+	/* htsv is only computed once for each tuple */
+	int8		htsv[MaxHeapTuplesPerPage + 1];
 } PruneState;
 
 /* Local functions */
 static int	heap_prune_chain(Buffer buffer,
 							 OffsetNumber rootoffnum,
 							 PruneState *prstate);
+static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
+											   HeapTuple tup,
+											   Buffer buffer);
 static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
 static void heap_prune_record_redirect(PruneState *prstate,
 									   OffsetNumber offnum, OffsetNumber rdoffnum);
@@ -252,6 +258,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 	OffsetNumber offnum,
 				maxoff;
 	PruneState	prstate;
+	HeapTupleData tup;
 
 	/*
 	 * Our strategy is to scan the page and make lists of items to change,
@@ -274,6 +281,51 @@ heap_page_prune(Relation relation, Buffer buffer,
 	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
 	memset(prstate.marked, 0, sizeof(prstate.marked));
 
+	/*
+	 * Determine HTSV for all tuples.
+	 *
+	 * This is required for correctness to deal with cases where running HTSV
+	 * twice could result in different results (e.g. RECENTLY_DEAD can turn to
+	 * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid()
+	 * to update the horizon, or INSERT_IN_PROGRESS can change to DEAD if the
+	 * inserting transaction aborts). That in turn could cause
+	 * heap_prune_chain() to behave incorrectly if a tuple is reached twice,
+	 * once directly via a heap_prune_chain() and once by following a HOT chain.
+	 *
+	 * It's also good for performance. Most commonly tuples within a page are
+	 * stored at decreasing offsets (while the items are stored at increasing
+	 * offsets). When processing all tuples on a page this leads to reading
+	 * memory at decreasing offsets within a page, with a variable
+	 * stride. That's hard for CPU prefetchers to deal with. Processing the
+	 * items in reverse order (and thus the tuples in increasing order)
+	 * increases prefetching efficiency significantly / decreases the number
+	 * of cache misses.
+	 */
+	tup.t_tableOid = RelationGetRelid(prstate.rel);
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (offnum = maxoff;
+		 offnum >= FirstOffsetNumber;
+		 offnum = OffsetNumberPrev(offnum))
+	{
+		ItemId		itemid;
+		HeapTupleHeader htup;
+
+		/* Nothing to do if slot doesn't contain tuple */
+		itemid = PageGetItemId(page, offnum);
+		if (!ItemIdIsNormal(itemid))
+		{
+			prstate.htsv[offnum] = -1;
+			continue;
+		}
+
+		htup = (HeapTupleHeader) PageGetItem(page, itemid);
+		tup.t_data = htup;
+		tup.t_len = ItemIdGetLength(itemid);
+		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum);
+
+		prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, buffer);
+	}
+
 	/* Scan the page */
 	maxoff = PageGetMaxOffsetNumber(page);
 	for (offnum = FirstOffsetNumber;
@@ -531,9 +583,6 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 	OffsetNumber chainitems[MaxHeapTuplesPerPage];
 	int			nchain = 0,
 				i;
-	HeapTupleData tup;
-
-	tup.t_tableOid = RelationGetRelid(prstate->rel);
 
 	rootlp = PageGetItemId(dp, rootoffnum);
 
@@ -542,12 +591,9 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 	 */
 	if (ItemIdIsNormal(rootlp))
 	{
+		Assert(prstate->htsv[rootoffnum] != -1);
 		htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
 
-		tup.t_data = htup;
-		tup.t_len = ItemIdGetLength(rootlp);
-		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum);
-
 		if (HeapTupleHeaderIsHeapOnly(htup))
 		{
 			/*
@@ -568,8 +614,8 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 			 * either here or while following a chain below.  Whichever path
 			 * gets there first will mark the tuple unused.
 			 */
-			if (heap_prune_satisfies_vacuum(prstate, &tup, buffer)
-				== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
+			if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD &&
+				!HeapTupleHeaderIsHotUpdated(htup))
 			{
 				heap_prune_record_unused(prstate, rootoffnum);
 				HeapTupleHeaderAdvanceLatestRemovedXid(htup,
@@ -636,12 +682,9 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 			break;
 
 		Assert(ItemIdIsNormal(lp));
+		Assert(prstate->htsv[offnum] != -1);
 		htup = (HeapTupleHeader) PageGetItem(dp, lp);
 
-		tup.t_data = htup;
-		tup.t_len = ItemIdGetLength(lp);
-		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum);
-
 		/*
 		 * Check the tuple XMIN against prior XMAX, if any
 		 */
@@ -659,7 +702,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 		 */
 		tupdead = recent_dead = false;
 
-		switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer))
+		switch (prstate->htsv[offnum])
 		{
 			case HEAPTUPLE_DEAD:
 				tupdead = true;
diff --git a/src/test/isolation/expected/prune-recently-dead.out b/src/test/isolation/expected/prune-recently-dead.out
index 02ef2713d86..e69822c29ce 100644
--- a/src/test/isolation/expected/prune-recently-dead.out
+++ b/src/test/isolation/expected/prune-recently-dead.out
@@ -157,11 +157,11 @@ t      |ShareUpdateExclusiveLock|relation|many_updates
 step s9_rollback: ROLLBACK;
 step vac_vacuum: <... completed>
 step mon_page: SELECT lp, lp_off, lp_flags, lp_len, /* t_xmin, t_xmax, */ t_field3, t_ctid, t_infomask2, t_infomask, mask.raw_flags, mask.combined_flags, t_hoff, t_bits  FROM heap_page_items(get_raw_page('many_updates', 0)), heap_tuple_infomask_flags(t_infomask, t_infomask2) AS mask
-lp|lp_off|lp_flags|lp_len|t_field3|t_ctid|t_infomask2|t_infomask|raw_flags|combined_flags|t_hoff|t_bits
---+------+--------+------+--------+------+-----------+----------+---------+--------------+------+------
- 1|     3|       2|     0|        |      |           |          |         |              |      |      
- 2|     0|       3|     0|        |      |           |          |         |              |      |      
- 3|     0|       0|     0|        |      |           |          |         |              |      |      
+lp|lp_off|lp_flags|lp_len|t_field3|t_ctid|t_infomask2|t_infomask|raw_flags                                                                               |combined_flags|t_hoff|t_bits
+--+------+--------+------+--------+------+-----------+----------+----------------------------------------------------------------------------------------+--------------+------+------
+ 1|     3|       2|     0|        |      |           |          |                                                                                        |              |      |      
+ 2|     0|       3|     0|        |      |           |          |                                                                                        |              |      |      
+ 3|  8152|       1|    36|       0|(0,3) |      40963|      9472|{HEAP_XMIN_COMMITTED,HEAP_XMAX_COMMITTED,HEAP_UPDATED,HEAP_KEYS_UPDATED,HEAP_ONLY_TUPLE}|{}            |    24|      
 (3 rows)
 
 step mon_view: 
@@ -181,10 +181,9 @@ step mon_verify:
   SELECT * FROM verify_heapam('many_updates');
   SELECT * FROM bt_index_parent_check('many_updates_id_idx', true, true);
 
-blkno|offnum|attnum|msg                                                
------+------+------+---------------------------------------------------
-    0|     1|      |line pointer redirection to unused item at offset 3
-(1 row)
+blkno|offnum|attnum|msg
+-----+------+------+---
+(0 rows)
 
 bt_index_parent_check
 ---------------------
@@ -195,22 +194,22 @@ step s1_insert_17:
   INSERT INTO many_updates (id) VALUES(17);
 
 step mon_page: SELECT lp, lp_off, lp_flags, lp_len, /* t_xmin, t_xmax, */ t_field3, t_ctid, t_infomask2, t_infomask, mask.raw_flags, mask.combined_flags, t_hoff, t_bits  FROM heap_page_items(get_raw_page('many_updates', 0)), heap_tuple_infomask_flags(t_infomask, t_infomask2) AS mask
-lp|lp_off|lp_flags|lp_len|t_field3|t_ctid|t_infomask2|t_infomask|raw_flags          |combined_flags|t_hoff|t_bits
---+------+--------+------+--------+------+-----------+----------+-------------------+--------------+------+------
- 1|     3|       2|     0|        |      |           |          |                   |              |      |      
- 2|     0|       3|     0|        |      |           |          |                   |              |      |      
- 3|  8152|       1|    36|       0|(0,3) |          3|      2048|{HEAP_XMAX_INVALID}|{}            |    24|      
-(3 rows)
+lp|lp_off|lp_flags|lp_len|t_field3|t_ctid|t_infomask2|t_infomask|raw_flags                                                                               |combined_flags|t_hoff|t_bits
+--+------+--------+------+--------+------+-----------+----------+----------------------------------------------------------------------------------------+--------------+------+------
+ 1|     3|       2|     0|        |      |           |          |                                                                                        |              |      |      
+ 2|     0|       3|     0|        |      |           |          |                                                                                        |              |      |      
+ 3|  8152|       1|    36|       0|(0,3) |      40963|      9472|{HEAP_XMIN_COMMITTED,HEAP_XMAX_COMMITTED,HEAP_UPDATED,HEAP_KEYS_UPDATED,HEAP_ONLY_TUPLE}|{}            |    24|      
+ 4|  8112|       1|    36|       0|(0,4) |          3|      2048|{HEAP_XMAX_INVALID}                                                                     |{}            |    24|      
+(4 rows)
 
 step s1_select_1: 
   SET LOCAL enable_seqscan = false;
   SELECT ctid, /*xmin, xmax, */ id FROM many_updates WHERE id = 1;
   RESET enable_seqscan;
 
-ctid |id
------+--
-(0,3)|17
-(1 row)
+ctid|id
+----+--
+(0 rows)
 
 step mon_verify: 
   SELECT * FROM verify_heapam('many_updates');
-- 
2.34.0

