From 42603e32bd0ad456a53a96ac2d05ce714f97e1ba Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Sun, 23 Apr 2023 19:26:18 +0200
Subject: [PATCH 1/2] Fix handling of NULLs in BRIN indexes

BRIN indexes did not properly distinguish between summaries for empty
(no rows) and all-NULL ranges. All summaries were initialized with
allnulls=true, and the opclasses simply reset allnulls to false when
processing the first non-NULL value. This however fails if the range
starts with a NULL value (or a sequence of NULL values), in which case
we forget the range contains NULL values.

This happens because the allnulls flag is used for two separate
purposes - to mark empty ranges (not representing any rows yet) and
ranges containing only NULL values.

Opclasses don't know which of these cases it is, and so don't know
whether to set hasnulls=true. Setting hasnulls=true in both cases would
make it correct, but it would also make BRIN indexes useless for queries
with IS NULL clauses - all ranges start empty (and thus allnulls=true),
so all ranges would end up with either allnulls=true or hasnulls=true.

The severity of the issue is somewhat reduced by the fact that it only
happens when adding values to an existing summary with allnulls=true,
not when the summarization is processing values in bulk (e.g. during
CREATE INDEX or automatic summarization). In this case the flags were
updated in a slightly different way, not forgetting the NULL values.

The best solution would be to introduce a new flag marking index tuples
representing ranges with no rows, but that would break on-disk format
and/or ABI, depending on where we put the flag. Considering we need to
backpatch this, that's not acceptable.

So instead we use an "impossible" combination of both flags (allnulls
and hasnulls) set to true, to mark "empty" ranges with no rows. In
principle "empty" is a feature of the whole index tuple, which may
contain multiple summaries in a multi-column index, but this is where
the flags are, unfortunately.

We could also skip storing index tuples for empty summaries, but then
we'd have to always process such ranges - even if there are no rows in
large parts of the table (e.g. after a bulk DELETE), it would still
require reading the pages etc. So we store them, but ignore them when
building the bitmap.

Backpatch to 11. The issue exists since BRIN indexes were introduced in
9.5, but older releases are already EOL.

Backpatch-through: 11
Reviewed-by: Justin Pryzby, Matthias van de Meent
Discussion: https://postgr.es/m/402430e4-7d9d-6cf1-09ef-464d80afff3b@enterprisedb.com
---
 src/backend/access/brin/brin.c                | 226 ++++++++++++++----
 src/backend/access/brin/brin_tuple.c          |  15 +-
 src/include/access/brin_tuple.h               |   6 +-
 ...summarization-and-inprogress-insertion.out |   8 +-
 ...ummarization-and-inprogress-insertion.spec |   1 +
 5 files changed, 201 insertions(+), 55 deletions(-)

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 0becfde1133..f99a9ba5cf9 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -35,6 +35,7 @@
 #include "storage/freespace.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
+#include "utils/datum.h"
 #include "utils/index_selfuncs.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
@@ -77,7 +78,8 @@ static void form_and_insert_tuple(BrinBuildState *state);
 static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
 						 BrinTuple *b);
 static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
-
+static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
+								BrinMemTuple *dtup, Datum *values, bool *nulls);
 
 /*
  * BRIN handler function: return IndexAmRoutine with access method parameters
@@ -173,11 +175,10 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 
 	for (;;)
 	{
-		bool		need_insert = false;
+		bool		need_insert;
 		OffsetNumber off;
 		BrinTuple  *brtup;
 		BrinMemTuple *dtup;
-		int			keyno;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -241,31 +242,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 
 		dtup = brin_deform_tuple(bdesc, brtup, NULL);
 
-		/*
-		 * Compare the key values of the new tuple to the stored index values;
-		 * our deformed tuple will get updated if the new tuple doesn't fit
-		 * the original range (note this means we can't break out of the loop
-		 * early). Make a note of whether this happens, so that we know to
-		 * insert the modified tuple later.
-		 */
-		for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
-		{
-			Datum		result;
-			BrinValues *bval;
-			FmgrInfo   *addValue;
-
-			bval = &dtup->bt_columns[keyno];
-			addValue = index_getprocinfo(idxRel, keyno + 1,
-										 BRIN_PROCNUM_ADDVALUE);
-			result = FunctionCall4Coll(addValue,
-									   idxRel->rd_indcollation[keyno],
-									   PointerGetDatum(bdesc),
-									   PointerGetDatum(bval),
-									   values[keyno],
-									   nulls[keyno]);
-			/* if that returned true, we need to insert the updated tuple */
-			need_insert |= DatumGetBool(result);
-		}
+		need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);
 
 		if (!need_insert)
 		{
@@ -508,6 +485,17 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 									   CurrentMemoryContext);
 					}
 
+					/*
+					 * If the BRIN tuple indicates that this range is empty,
+					 * we can skip it: there's nothing to match.  We don't
+					 * need to examine the next columns.
+					 */
+					if (dtup->bt_empty_range)
+					{
+						addrange = false;
+						break;
+					}
+
 					/*
 					 * Check whether the scan key is consistent with the page
 					 * range values; if so, have the pages in the range added
@@ -611,7 +599,6 @@ brinbuildCallback(Relation index,
 {
 	BrinBuildState *state = (BrinBuildState *) brstate;
 	BlockNumber thisblock;
-	int			i;
 
 	thisblock = ItemPointerGetBlockNumber(tid);
 
@@ -640,25 +627,8 @@ brinbuildCallback(Relation index,
 	}
 
 	/* Accumulate the current tuple into the running state */
-	for (i = 0; i < state->bs_bdesc->bd_tupdesc->natts; i++)
-	{
-		FmgrInfo   *addValue;
-		BrinValues *col;
-		Form_pg_attribute attr = TupleDescAttr(state->bs_bdesc->bd_tupdesc, i);
-
-		col = &state->bs_dtuple->bt_columns[i];
-		addValue = index_getprocinfo(index, i + 1,
-									 BRIN_PROCNUM_ADDVALUE);
-
-		/*
-		 * Update dtuple state, if and as necessary.
-		 */
-		FunctionCall4Coll(addValue,
-						  attr->attcollation,
-						  PointerGetDatum(state->bs_bdesc),
-						  PointerGetDatum(col),
-						  values[i], isnull[i]);
-	}
+	(void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
+							   values, isnull);
 }
 
 /*
@@ -1448,6 +1418,8 @@ form_and_insert_tuple(BrinBuildState *state)
 /*
  * Given two deformed tuples, adjust the first one so that it's consistent
  * with the summary values in both.
+ *
+ * XXX I'm not sure we can actually get empty "b".
  */
 static void
 union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
@@ -1465,6 +1437,64 @@ union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
 	db = brin_deform_tuple(bdesc, b, NULL);
 	MemoryContextSwitchTo(oldcxt);
 
+	/*
+	 * Check if the ranges are empty.
+	 *
+	 * If at least one of them is empty, we don't need to call per-key union
+	 * functions at all. If "b" is empty, we just use "a" as the result (it
+	 * might be empty fine, but that's fine). If "a" is empty but "b" is not,
+	 * we use "b" as the result (but we have to copy the data into "a" first).
+	 *
+	 * Only when both ranges are non-empty, we actually do the per-key merge.
+	 */
+
+	/* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
+	if (db->bt_empty_range)
+	{
+		/* skip the per-key merge */
+		MemoryContextDelete(cxt);
+		return;
+	}
+
+	/*
+	 * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
+	 * But we need to copy the data from "b" to "a" first, because that's how
+	 * we pass result out.
+	 *
+	 * We have to copy all the global/per-key flags etc. too.
+	 */
+	if (a->bt_empty_range)
+	{
+		for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+		{
+			int			i;
+			BrinValues *col_a = &a->bt_columns[keyno];
+			BrinValues *col_b = &db->bt_columns[keyno];
+			BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
+
+			col_a->bv_allnulls = col_b->bv_allnulls;
+			col_a->bv_hasnulls = col_b->bv_hasnulls;
+
+			/* If "b" has no data, we're done. */
+			if (col_b->bv_allnulls)
+				continue;
+
+			for (i = 0; i < opcinfo->oi_nstored; i++)
+				col_a->bv_values[i] =
+					datumCopy(col_b->bv_values[i],
+							  opcinfo->oi_typcache[i]->typbyval,
+							  opcinfo->oi_typcache[i]->typlen);
+		}
+
+		/* "a" started empty, but "b" was not empty, so remember that */
+		a->bt_empty_range = false;
+
+		/* skip the per-key merge */
+		MemoryContextDelete(cxt);
+		return;
+	}
+
+	/* Now we know neither range is empty. */
 	for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
 	{
 		FmgrInfo   *unionFn;
@@ -1523,3 +1553,103 @@ brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
 	 */
 	FreeSpaceMapVacuum(idxrel);
 }
+
+static bool
+add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
+					Datum *values, bool *nulls)
+{
+	int			keyno;
+
+	/* If the range starts empty, we're certainly going to modify it. */
+	bool		modified = dtup->bt_empty_range;
+
+	/*
+	 * Compare the key values of the new tuple to the stored index values;
+	 * our deformed tuple will get updated if the new tuple doesn't fit
+	 * the original range (note this means we can't break out of the loop
+	 * early). Make a note of whether this happens, so that we know to
+	 * insert the modified tuple later.
+	 */
+	for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+	{
+		Datum		result;
+		BrinValues *bval;
+		FmgrInfo   *addValue;
+		bool		has_nulls;
+
+		bval = &dtup->bt_columns[keyno];
+
+		/*
+		 * Does the range have actual NULL values? Either of the flags can
+		 * be set, but we ignore the state before adding first row.
+		 *
+		 * We have to remember this, because we'll modify the flags and we
+		 * need to know if the range started as empty.
+		 */
+		has_nulls = ((!dtup->bt_empty_range) &&
+					 (bval->bv_hasnulls || bval->bv_allnulls));
+
+		/*
+		 * If the value we're adding is NULL, handle it locally. Otherwise
+		 * call the BRIN_PROCNUM_ADDVALUE procedure.
+		 */
+		if (nulls[keyno])
+		{
+			/*
+			 * We can't check "bv_hasnulls" because then we might end up with
+			 * both flags set to true, which is interpreted as empty range.
+			 * But that'd be wrong, because we've just added a value.
+			 *
+			 * So either the range has allnulls=true, or we have to set the
+			 * hasnulls flag. Check if we're changing the value to determine
+			 * if the index tuple was modified.
+			 */
+			if (!bval->bv_allnulls)
+			{
+				/* Are we changing the tuple? */
+				modified |= (!bval->bv_hasnulls);
+				bval->bv_hasnulls = true;
+			}
+		}
+		else
+		{
+			addValue = index_getprocinfo(idxRel, keyno + 1,
+										 BRIN_PROCNUM_ADDVALUE);
+			result = FunctionCall4Coll(addValue,
+									   idxRel->rd_indcollation[keyno],
+									   PointerGetDatum(bdesc),
+									   PointerGetDatum(bval),
+									   values[keyno],
+									   nulls[keyno]);
+			/* if that returned true, we need to insert the updated tuple */
+			modified |= DatumGetBool(result);
+		}
+
+		/*
+		 * If the range was had actual NULL values (i.e. did not start empty),
+		 * make sure we don't forget about the NULL values. Either the allnulls
+		 * flag is still set to true, or (if the opclass cleared it) we need to
+		 * set hasnulls=true.
+		 *
+		 * XXX This can only happen when the opclass modified the tuple, so the
+		 * modified flag should be set.
+		 */
+		if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
+		{
+			Assert(modified);
+			bval->bv_hasnulls = true;
+		}
+	}
+
+	/*
+	 * After updating summaries for all the keys, mark it as not empty.
+	 *
+	 * If we're actually changing the flag value (i.e. tuple started as
+	 * empty), we should have modified the tuple. So we should not see
+	 * empty range that was not modified.
+	 */
+	Assert(!dtup->bt_empty_range || modified);
+	dtup->bt_empty_range = false;
+
+	return modified;
+}
diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c
index b3b453aed12..f9877980f4d 100644
--- a/src/backend/access/brin/brin_tuple.c
+++ b/src/backend/access/brin/brin_tuple.c
@@ -349,6 +349,9 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
 	if (tuple->bt_placeholder)
 		rettuple->bt_info |= BRIN_PLACEHOLDER_MASK;
 
+	if (tuple->bt_empty_range)
+		rettuple->bt_info |= BRIN_EMPTY_RANGE_MASK;
+
 	*size = len;
 	return rettuple;
 }
@@ -376,7 +379,7 @@ brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
 	rettuple = palloc0(len);
 	rettuple->bt_blkno = blkno;
 	rettuple->bt_info = hoff;
-	rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK;
+	rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK | BRIN_EMPTY_RANGE_MASK;
 
 	bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
 	bitmask = HIGHBIT;
@@ -466,6 +469,8 @@ brin_new_memtuple(BrinDesc *brdesc)
 	dtup->bt_allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
 	dtup->bt_hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
 
+	dtup->bt_empty_range = true;
+
 	dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext,
 											 "brin dtuple",
 											 ALLOCSET_DEFAULT_SIZES);
@@ -499,6 +504,9 @@ brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
 		currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored;
 	}
 
+	/* FIXME Shouldn't this reset bt_placeholder too? */
+	dtuple->bt_empty_range = true;
+
 	return dtuple;
 }
 
@@ -532,6 +540,11 @@ brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
 
 	if (BrinTupleIsPlaceholder(tuple))
 		dtup->bt_placeholder = true;
+
+	/* ranges start as empty, depends on the BrinTuple */
+	if (!BrinTupleIsEmptyRange(tuple))
+		dtup->bt_empty_range = false;
+
 	dtup->bt_blkno = tuple->bt_blkno;
 
 	values = dtup->bt_values;
diff --git a/src/include/access/brin_tuple.h b/src/include/access/brin_tuple.h
index a9ccc3995b4..5280872707a 100644
--- a/src/include/access/brin_tuple.h
+++ b/src/include/access/brin_tuple.h
@@ -36,6 +36,7 @@ typedef struct BrinValues
 typedef struct BrinMemTuple
 {
 	bool		bt_placeholder; /* this is a placeholder tuple */
+	bool		bt_empty_range;	/* range represents no tuples */
 	BlockNumber bt_blkno;		/* heap blkno that the tuple is for */
 	MemoryContext bt_context;	/* memcxt holding the bt_columns values */
 	/* output arrays for brin_deform_tuple: */
@@ -61,7 +62,7 @@ typedef struct BrinTuple
 	 *
 	 * 7th (high) bit: has nulls
 	 * 6th bit: is placeholder tuple
-	 * 5th bit: unused
+	 * 5th bit: range is empty
 	 * 4-0 bit: offset of data
 	 * ---------------
 	 */
@@ -74,13 +75,14 @@ typedef struct BrinTuple
  * bt_info manipulation macros
  */
 #define BRIN_OFFSET_MASK		0x1F
-/* bit 0x20 is not used at present */
+#define BRIN_EMPTY_RANGE_MASK	0x20
 #define BRIN_PLACEHOLDER_MASK	0x40
 #define BRIN_NULLS_MASK			0x80
 
 #define BrinTupleDataOffset(tup)	((Size) (((BrinTuple *) (tup))->bt_info & BRIN_OFFSET_MASK))
 #define BrinTupleHasNulls(tup)	(((((BrinTuple *) (tup))->bt_info & BRIN_NULLS_MASK)) != 0)
 #define BrinTupleIsPlaceholder(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_PLACEHOLDER_MASK)) != 0)
+#define BrinTupleIsEmptyRange(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_EMPTY_RANGE_MASK)) != 0)
 
 
 extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno,
diff --git a/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out b/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out
index 2a4755d0998..584ac2602f7 100644
--- a/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out
+++ b/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out
@@ -4,7 +4,7 @@ starting permutation: s2check s1b s2b s1i s2summ s1c s2c s2check
 step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
 itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value   
 ----------+------+------+--------+--------+-----------+--------
-         1|     0|     1|f       |f       |f          |{1 .. 1}
+         1|     0|     1|f       |t       |f          |{1 .. 1}
 (1 row)
 
 step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
@@ -26,7 +26,7 @@ step s2c: COMMIT;
 step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
 itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value      
 ----------+------+------+--------+--------+-----------+-----------
-         1|     0|     1|f       |f       |f          |{1 .. 1}   
+         1|     0|     1|f       |t       |f          |{1 .. 1}   
          2|     1|     1|f       |f       |f          |{1 .. 1000}
 (2 rows)
 
@@ -35,7 +35,7 @@ starting permutation: s2check s1b s1i s2vacuum s1c s2check
 step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
 itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value   
 ----------+------+------+--------+--------+-----------+--------
-         1|     0|     1|f       |f       |f          |{1 .. 1}
+         1|     0|     1|f       |t       |f          |{1 .. 1}
 (1 row)
 
 step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
@@ -45,7 +45,7 @@ step s1c: COMMIT;
 step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
 itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value      
 ----------+------+------+--------+--------+-----------+-----------
-         1|     0|     1|f       |f       |f          |{1 .. 1}   
+         1|     0|     1|f       |t       |f          |{1 .. 1}   
          2|     1|     1|f       |f       |f          |{1 .. 1000}
 (2 rows)
 
diff --git a/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec b/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec
index 19ac18a2e88..18ba92b7ba1 100644
--- a/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec
+++ b/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec
@@ -9,6 +9,7 @@ setup
     ) WITH (fillfactor=10);
     CREATE INDEX brinidx ON brin_iso USING brin (value) WITH (pages_per_range=1);
     -- this fills the first page
+    INSERT INTO brin_iso VALUES (NULL);
     DO $$
     DECLARE curtid tid;
     BEGIN
-- 
2.40.0

