From 604899b6afe70eccbbdbf69ce254f37808c598db Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepikhov@postgrespro.ru>
Date: Mon, 27 Nov 2023 11:23:48 +0700
Subject: [PATCH] Choose an index path with the best selectivity estimation.

In the case when optimizer predicts only one row prefer choosing UNIQUE indexes
In other cases, if optimizer treats indexes as equal, make a last attempt
selecting the index with less selectivity - this decision takes away dependency
on the order of indexes in an index list (good for reproduction of some issues)
and proposes one more objective argument to choose specific index.
---
 src/backend/optimizer/util/pathnode.c         | 42 ++++++++++++++++++
 .../expected/drop-index-concurrently-1.out    | 16 ++++---
 src/test/regress/expected/functional_deps.out | 43 +++++++++++++++++++
 src/test/regress/expected/join.out            | 40 +++++++++--------
 src/test/regress/sql/functional_deps.sql      | 36 ++++++++++++++++
 5 files changed, 151 insertions(+), 26 deletions(-)

diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 0b1d17b9d3..4b5aedd579 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -454,6 +454,48 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 		costcmp = compare_path_costs_fuzzily(new_path, old_path,
 											 STD_FUZZ_FACTOR);
 
+		/*
+		 * Apply some heuristics on index paths.
+		 */
+		if (IsA(new_path, IndexPath) && IsA(old_path, IndexPath))
+		{
+			IndexPath *inp = (IndexPath *) new_path;
+			IndexPath *iop = (IndexPath *) old_path;
+
+			if (new_path->rows <= 1.0 && old_path->rows <= 1.0)
+			{
+				/*
+				 * When both paths are predicted to produce only one tuple,
+				 * the optimiser should prefer choosing a unique index scan
+				 * in all cases.
+				 */
+				if (inp->indexinfo->unique && !iop->indexinfo->unique)
+					costcmp = COSTS_BETTER1;
+				else if (!inp->indexinfo->unique && iop->indexinfo->unique)
+					costcmp = COSTS_BETTER2;
+				else if (costcmp != COSTS_DIFFERENT)
+					/*
+					 * If the optimiser doesn't have an obviously stable choice
+					 * of unique index, increase the chance of avoiding mistakes
+					 * by choosing an index with smaller selectivity.
+					 * This option makes decision more conservative and looks
+					 * debatable.
+					 */
+					costcmp = (inp->indexselectivity < iop->indexselectivity) ?
+												COSTS_BETTER1 : COSTS_BETTER2;
+			}
+			else if (costcmp == COSTS_EQUAL)
+				/*
+				 * The optimizer can't differ the value of two index paths.
+				 * To avoid making a decision that is based on only an index
+				 * order in the list, use some rational strategy based on
+				 * selectivity: prefer touching fewer tuples on the disk to
+				 * filtering them after.
+				 */
+				costcmp = (inp->indexselectivity < iop->indexselectivity) ?
+												COSTS_BETTER1 : COSTS_BETTER2;
+		}
+
 		/*
 		 * If the two paths compare differently for startup and total cost,
 		 * then we want to keep both, and we can skip comparing pathkeys and
diff --git a/src/test/isolation/expected/drop-index-concurrently-1.out b/src/test/isolation/expected/drop-index-concurrently-1.out
index 1cb2250891..2392cdb033 100644
--- a/src/test/isolation/expected/drop-index-concurrently-1.out
+++ b/src/test/isolation/expected/drop-index-concurrently-1.out
@@ -12,13 +12,15 @@ step preps: PREPARE getrow_seqscan AS SELECT * FROM test_dc WHERE data = 34 ORDE
 step begin: BEGIN;
 step disableseq: SET enable_seqscan = false;
 step explaini: EXPLAIN (COSTS OFF) EXECUTE getrow_idxscan;
-QUERY PLAN                                    
-----------------------------------------------
-Sort                                          
-  Sort Key: id                                
-  ->  Index Scan using test_dc_data on test_dc
-        Index Cond: (data = 34)               
-(4 rows)
+QUERY PLAN                                   
+---------------------------------------------
+Sort                                         
+  Sort Key: id                               
+  ->  Bitmap Heap Scan on test_dc            
+        Recheck Cond: (data = 34)            
+        ->  Bitmap Index Scan on test_dc_data
+              Index Cond: (data = 34)        
+(6 rows)
 
 step enableseq: SET enable_seqscan = true;
 step explains: EXPLAIN (COSTS OFF) EXECUTE getrow_seqscan;
diff --git a/src/test/regress/expected/functional_deps.out b/src/test/regress/expected/functional_deps.out
index 32381b8ae7..61af99a041 100644
--- a/src/test/regress/expected/functional_deps.out
+++ b/src/test/regress/expected/functional_deps.out
@@ -230,3 +230,46 @@ EXECUTE foo;
 ALTER TABLE articles DROP CONSTRAINT articles_pkey RESTRICT;
 EXECUTE foo;  -- fail
 ERROR:  column "articles.keywords" must appear in the GROUP BY clause or be used in an aggregate function
+/*
+ * Corner cases of PostgreSQL optimizer:
+ *
+ * Correlations between columns aren't found by DBMS.
+ * Selectivities multiplication of many columns increases total selectivity
+ * error. If such non-true selectivity is so small, that rows estimation
+ * give us absolute minimum (1) then the optimizer can't choose between different
+ * indexes and uses first from the index list (last created).
+ */
+\set scale 100000
+CREATE TABLE t AS (
+    SELECT  c1 AS c1, -- selectivity(c1)*selectivity(c2)*nrows <= 1
+            c1 AS c2,
+			-- Create two columns with different selectivity.
+            (c1 % 2) AS c3, -- missing from a good index.
+            (c1 % 4) AS c4 -- missing from a bad index.
+    FROM generate_series(1,:scale) AS c1
+);
+UPDATE t SET c1=1,c2=1,c3=1,c4=2 WHERE c1<:scale/1000;
+CREATE INDEX bad ON t (c1,c2,c3);
+CREATE INDEX good ON t (c1,c2,c4);
+ANALYZE t; -- update stat on the indexes.
+EXPLAIN (COSTS OFF) SELECT * FROM t WHERE c1=1 AND c2=1 AND c3=1 AND c4=1;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Index Scan using good on t
+   Index Cond: ((c1 = 1) AND (c2 = 1) AND (c4 = 1))
+   Filter: (c3 = 1)
+(3 rows)
+
+-- Set the bad index to the first position in the index list.
+DROP INDEX bad;
+CREATE INDEX bad ON t (c1,c2,c3);
+ANALYZE t;
+EXPLAIN (COSTS OFF) SELECT * FROM t WHERE c1=1 AND c2=1 AND c3=1 AND c4=1;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Index Scan using good on t
+   Index Cond: ((c1 = 1) AND (c2 = 1) AND (c4 = 1))
+   Filter: (c3 = 1)
+(3 rows)
+
+DROP TABLE t CASCADE;
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 2c73270143..32b33fabd3 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -8629,14 +8629,15 @@ analyze j2;
 explain (costs off) select * from j1
 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
 where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
-               QUERY PLAN                
------------------------------------------
+                       QUERY PLAN                        
+---------------------------------------------------------
  Merge Join
-   Merge Cond: (j1.id1 = j2.id1)
-   Join Filter: (j2.id2 = j1.id2)
-   ->  Index Scan using j1_id1_idx on j1
-   ->  Index Scan using j2_id1_idx on j2
-(5 rows)
+   Merge Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2))
+   ->  Index Only Scan using j1_pkey on j1
+         Filter: ((id1 % 1000) = 1)
+   ->  Index Only Scan using j2_pkey on j2
+         Filter: ((id1 % 1000) = 1)
+(6 rows)
 
 select * from j1
 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
@@ -8651,15 +8652,16 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
 explain (costs off) select * from j1
 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
 where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 = any (array[1]);
-                     QUERY PLAN                     
-----------------------------------------------------
+                       QUERY PLAN                        
+---------------------------------------------------------
  Merge Join
-   Merge Cond: (j1.id1 = j2.id1)
-   Join Filter: (j2.id2 = j1.id2)
-   ->  Index Scan using j1_id1_idx on j1
-   ->  Index Scan using j2_id1_idx on j2
+   Merge Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2))
+   ->  Index Only Scan using j1_pkey on j1
+         Filter: ((id1 % 1000) = 1)
+   ->  Index Only Scan using j2_pkey on j2
          Index Cond: (id1 = ANY ('{1}'::integer[]))
-(6 rows)
+         Filter: ((id1 % 1000) = 1)
+(7 rows)
 
 select * from j1
 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
@@ -8674,12 +8676,12 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 = any (array[1]);
 explain (costs off) select * from j1
 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
 where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 >= any (array[1,5]);
-                      QUERY PLAN                       
--------------------------------------------------------
+                       QUERY PLAN                        
+---------------------------------------------------------
  Merge Join
-   Merge Cond: (j1.id1 = j2.id1)
-   Join Filter: (j2.id2 = j1.id2)
-   ->  Index Scan using j1_id1_idx on j1
+   Merge Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2))
+   ->  Index Only Scan using j1_pkey on j1
+         Filter: ((id1 % 1000) = 1)
    ->  Index Only Scan using j2_pkey on j2
          Index Cond: (id1 >= ANY ('{1,5}'::integer[]))
          Filter: ((id1 % 1000) = 1)
diff --git a/src/test/regress/sql/functional_deps.sql b/src/test/regress/sql/functional_deps.sql
index 406490b995..f617ee9269 100644
--- a/src/test/regress/sql/functional_deps.sql
+++ b/src/test/regress/sql/functional_deps.sql
@@ -208,3 +208,39 @@ EXECUTE foo;
 ALTER TABLE articles DROP CONSTRAINT articles_pkey RESTRICT;
 
 EXECUTE foo;  -- fail
+
+/*
+ * Corner cases of PostgreSQL optimizer:
+ *
+ * Correlations between columns aren't found by DBMS.
+ * Selectivities multiplication of many columns increases total selectivity
+ * error. If such non-true selectivity is so small, that rows estimation
+ * give us absolute minimum (1) then the optimizer can't choose between different
+ * indexes and uses first from the index list (last created).
+ */
+\set scale 100000
+
+CREATE TABLE t AS (
+    SELECT  c1 AS c1, -- selectivity(c1)*selectivity(c2)*nrows <= 1
+            c1 AS c2,
+			-- Create two columns with different selectivity.
+            (c1 % 2) AS c3, -- missing from a good index.
+            (c1 % 4) AS c4 -- missing from a bad index.
+    FROM generate_series(1,:scale) AS c1
+);
+UPDATE t SET c1=1,c2=1,c3=1,c4=2 WHERE c1<:scale/1000;
+
+CREATE INDEX bad ON t (c1,c2,c3);
+CREATE INDEX good ON t (c1,c2,c4);
+ANALYZE t; -- update stat on the indexes.
+
+EXPLAIN (COSTS OFF) SELECT * FROM t WHERE c1=1 AND c2=1 AND c3=1 AND c4=1;
+
+-- Set the bad index to the first position in the index list.
+DROP INDEX bad;
+CREATE INDEX bad ON t (c1,c2,c3);
+ANALYZE t;
+
+EXPLAIN (COSTS OFF) SELECT * FROM t WHERE c1=1 AND c2=1 AND c3=1 AND c4=1;
+
+DROP TABLE t CASCADE;
-- 
2.43.0