From 42044b5745213b8a9ebba487e5e8b7ac23d1e3e7 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Mon, 10 Jun 2024 15:49:07 +0200
Subject: [PATCH v2 1/2] Fix creation of partition descriptor during concurrent
 detach

When a partition is being detached in concurrent mode, it is possible
for find_inheritance_children_extended() to return that partition in the
list, and immediately after that receive an invalidation message that
sets its relpartbound to NULL just before we read it.  (This can happen
because table_open() reads invalidation messages.)  Currently we raise
an error
  ERROR:  missing relpartbound for relation %u
about the situation, but that's bogus.  We can cope better by retrying
the find_inheritance_children_extended call to get a new list, which
will no longer have the partition being detached.

Noticed while investigating bug #18377.

Backpatch to 14, where DETACH CONCURRENTLY appeared.

Discussion: https://postgr.es/m/202405201616.y4ht2qe5ihoy@alvherre.pgsql
---
 src/backend/partitioning/partdesc.c | 54 ++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c
index 47c97566e6..b8c196536e 100644
--- a/src/backend/partitioning/partdesc.c
+++ b/src/backend/partitioning/partdesc.c
@@ -24,6 +24,7 @@
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
 #include "utils/hsearch.h"
+#include "utils/inval.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/partcache.h"
@@ -144,16 +145,19 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached)
 	ListCell   *cell;
 	int			i,
 				nparts;
+	bool		retried = false;
 	PartitionKey key = RelationGetPartitionKey(rel);
 	MemoryContext new_pdcxt;
 	MemoryContext oldcxt;
 	int		   *mapping;
 
+retry:
+
 	/*
 	 * Get partition oids from pg_inherits.  This uses a single snapshot to
 	 * fetch the list of children, so while more children may be getting added
-	 * concurrently, whatever this function returns will be accurate as of
-	 * some well-defined point in time.
+	 * or removed concurrently, whatever this function returns will be
+	 * accurate as of some well-defined point in time.
 	 */
 	detached_exist = false;
 	detached_xmin = InvalidTransactionId;
@@ -196,18 +200,27 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached)
 		}
 
 		/*
-		 * The system cache may be out of date; if so, we may find no pg_class
-		 * tuple or an old one where relpartbound is NULL.  In that case, try
-		 * the table directly.  We can't just AcceptInvalidationMessages() and
-		 * retry the system cache lookup because it's possible that a
-		 * concurrent ATTACH PARTITION operation has removed itself from the
-		 * ProcArray but not yet added invalidation messages to the shared
-		 * queue; InvalidateSystemCaches() would work, but seems excessive.
+		 * Two problems are possible here.  First, a concurrent ATTACH
+		 * PARTITION might be in the process of adding a new partition, but
+		 * the syscache doesn't have it, or its copy of it does not yet have
+		 * its relpartbound set.  We cannot just AcceptInvalidationMessages(),
+		 * because the other process might have already removed itself from
+		 * the ProcArray but not yet added its invalidation messages to the
+		 * shared queue.  We solve this problem by reading pg_class directly
+		 * for the desired tuple.
 		 *
-		 * Note that this algorithm assumes that PartitionBoundSpec we manage
-		 * to fetch is the right one -- so this is only good enough for
-		 * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
-		 * some hypothetical operation that changes the partition bounds.
+		 * The other problem is that DETACH CONCURRENTLY is in the process of
+		 * removing a partition, which happens in two steps: first it marks it
+		 * as "detach pending", commits, then unsets relpartbound.  If
+		 * find_inheritance_children_extended included that partition but we
+		 * below we see that DETACH CONCURRENTLY has reset relpartbound for
+		 * it, we'd see an inconsistent view.  (The inconsistency is seen
+		 * because table_open below reads invalidation messages.)  We protect
+		 * against this by doing the find_inheritance_children_extended again.
+		 * We only need to do it once, because only one DETACH CONCURRENTLY
+		 * can run at one time, and it has a wait-for-snapshots phase which
+		 * would wait for us, so we cannot be affected by more than one of
+		 * them.
 		 */
 		if (boundspec == NULL)
 		{
@@ -231,6 +244,21 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached)
 				boundspec = stringToNode(TextDatumGetCString(datum));
 			systable_endscan(scan);
 			table_close(pg_class, AccessShareLock);
+
+			/*
+			 * If we still don't get a relpartbound value, then it must be
+			 * because of DETACH CONCURRENTLY.  Restart from above.
+			 *
+			 * Note that the memory context is a short-lived one, so we don't
+			 * need to worry about leaking the part of the partition
+			 * descriptor we've already built.
+			 */
+			if (!boundspec && !retried)
+			{
+				AcceptInvalidationMessages();
+				retried = true;
+				goto retry;
+			}
 		}
 
 		/* Sanity checks. */
-- 
2.39.2

