From 22b572903b8c1a2459b051f91e0902a2fc243648 Mon Sep 17 00:00:00 2001
From: "Andrei V. Lepikhov" <lepihov@gmail.com>
Date: Wed, 18 Sep 2024 19:25:02 +0200
Subject: [PATCH v2] Improve statistics estimation considering GROUP-BY as a
 'uniqueiser'.

This commit follows the idea of the 4767bc8ff2. GROUP-BY makes the grouped
column 'highly likely unique'. We can employ this fact in the statistics to
make more precise estimations in the upper query block.
---
 src/backend/utils/adt/selfuncs.c        | 37 +++++++++++++++----------
 src/include/utils/selfuncs.h            |  2 +-
 src/test/regress/expected/stats_ext.out | 28 +++++++++++++++++++
 src/test/regress/sql/stats_ext.sql      | 19 +++++++++++++
 4 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 03d7fb5f48..b6ce8978c1 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -321,8 +321,8 @@ var_eq_const(VariableStatData *vardata, Oid oproid, Oid collation,
 	}
 
 	/*
-	 * If we matched the var to a unique index or DISTINCT clause, assume
-	 * there is exactly one match regardless of anything else.  (This is
+	 * If we matched the var to a unique index, DISTINCT or GROUP-BY clause,
+	 * assume there is exactly one match regardless of anything else.  (This is
 	 * slightly bogus, since the index or clause's equality operator might be
 	 * different from ours, but it's much more likely to be right than
 	 * ignoring the information.)
@@ -483,8 +483,8 @@ var_eq_non_const(VariableStatData *vardata, Oid oproid, Oid collation,
 	}
 
 	/*
-	 * If we matched the var to a unique index or DISTINCT clause, assume
-	 * there is exactly one match regardless of anything else.  (This is
+	 * If we matched the var to a unique index, DISTINCT or GROUP-BY clause,
+	 * assume there is exactly one match regardless of anything else.  (This is
 	 * slightly bogus, since the index or clause's equality operator might be
 	 * different from ours, but it's much more likely to be right than
 	 * ignoring the information.)
@@ -5005,9 +5005,9 @@ ReleaseDummy(HeapTuple tuple)
  *	atttype, atttypmod: actual type/typmod of the "var" expression.  This is
  *		commonly the same as the exposed type of the variable argument,
  *		but can be different in binary-compatible-type cases.
- *	isunique: true if we were able to match the var to a unique index or a
- *		single-column DISTINCT clause, implying its values are unique for
- *		this query.  (Caution: this should be trusted for statistical
+ *	isunique: true if we were able to match the var to a unique index, a
+ *		single-column DISTINCT or GROUP-BY clause, implying its values are
+ *		unique for this query.  (Caution: this should be trusted for statistical
  *		purposes only, since we do not check indimmediate nor verify that
  *		the exact same definition of equality applies.)
  *	acl_ok: true if current user has permission to read the column(s)
@@ -5654,7 +5654,7 @@ examine_simple_variable(PlannerInfo *root, Var *var,
 		Assert(IsA(subquery, Query));
 
 		/*
-		 * Punt if subquery uses set operations or GROUP BY, as these will
+		 * Punt if subquery uses set operations or grouping sets, as these will
 		 * mash underlying columns' stats beyond recognition.  (Set ops are
 		 * particularly nasty; if we forged ahead, we would return stats
 		 * relevant to only the leftmost subselect...)	DISTINCT is also
@@ -5662,7 +5662,6 @@ examine_simple_variable(PlannerInfo *root, Var *var,
 		 * of learning something even with it.
 		 */
 		if (subquery->setOperations ||
-			subquery->groupClause ||
 			subquery->groupingSets)
 			return;
 
@@ -5692,6 +5691,16 @@ examine_simple_variable(PlannerInfo *root, Var *var,
 			return;
 		}
 
+		/* The same idea as with DISTINCT clause works for a GROUP-BY too */
+		if (subquery->groupClause)
+		{
+			if (list_length(subquery->groupClause) == 1 &&
+				targetIsInSortList(ste, InvalidOid, subquery->groupClause))
+				vardata->isunique = true;
+			/* cannot go further */
+			return;
+		}
+
 		/*
 		 * If the sub-query originated from a view with the security_barrier
 		 * attribute, we must not look at the variable's statistics, though it
@@ -5843,11 +5852,11 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
 	}
 
 	/*
-	 * If there is a unique index or DISTINCT clause for the variable, assume
-	 * it is unique no matter what pg_statistic says; the statistics could be
-	 * out of date, or we might have found a partial unique index that proves
-	 * the var is unique for this query.  However, we'd better still believe
-	 * the null-fraction statistic.
+	 * If there is a unique index, DISTINCT or GROUP-BY clause for the variable,
+	 * assume it is unique no matter what pg_statistic says; the statistics
+	 * could be out of date, or we might have found a partial unique index that
+	 * proves the var is unique for this query.  However, we'd better still
+	 * believe the null-fraction statistic.
 	 */
 	if (vardata->isunique)
 		stadistinct = -1.0 * (1.0 - stanullfrac);
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index f2563ad1cb..b1ac81977b 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -94,7 +94,7 @@ typedef struct VariableStatData
 	Oid			vartype;		/* exposed type of expression */
 	Oid			atttype;		/* actual type (after stripping relabel) */
 	int32		atttypmod;		/* actual typmod (after stripping relabel) */
-	bool		isunique;		/* matches unique index or DISTINCT clause */
+	bool		isunique;		/* matches unique index, DISTINCT or GROUP-BY clause */
 	bool		acl_ok;			/* result of ACL check on table or column */
 } VariableStatData;
 
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 8c4da95508..dfe9baf4b2 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3074,6 +3074,34 @@ SELECT c0 FROM ONLY expr_stats_incompatible_test WHERE
 (0 rows)
 
 DROP TABLE expr_stats_incompatible_test;
+-- Use the 'GROUP BY x' operator to claim 'x' as a unique column upstream.
+CREATE TABLE gu_1 (x real);
+CREATE TABLE gu_2 (x real);
+CREATE TABLE gu_3 (x numeric);
+INSERT INTO gu_1 (x) VALUES (1);
+INSERT INTO gu_2 (x) SELECT gs FROM generate_series(1,1000) AS gs;
+INSERT INTO gu_3 VALUES ('1.'), ('1.0'), ('1.00'), ('1.000'), ('1.0000');
+VACUUM ANALYZE gu_1,gu_2,gu_3;
+SELECT * FROM check_estimated_rows('
+  SELECT * FROM gu_1 LEFT JOIN (
+    SELECT x FROM gu_2 GROUP BY x) AS q1
+    ON gu_1.x=q1.x;');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
+-- Special case when GROUP BY comparison operator differs from the upper one
+SELECT * FROM check_estimated_rows('
+SELECT * FROM gu_1 LEFT JOIN (
+select x::real from (SELECT x::text FROM gu_3) group by x) AS q1
+  ON gu_1.x=q1.x;');
+ estimated | actual 
+-----------+--------
+         1 |      5
+(1 row)
+
+DROP TABLE gu_1,gu_2,gu_3;
 -- Permission tests. Users should not be able to see specific data values in
 -- the extended statistics, if they lack permission to see those values in
 -- the underlying table.
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 0c08a6cc42..5e30007c37 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1547,6 +1547,25 @@ SELECT c0 FROM ONLY expr_stats_incompatible_test WHERE
 
 DROP TABLE expr_stats_incompatible_test;
 
+-- Use the 'GROUP BY x' operator to claim 'x' as a unique column upstream.
+CREATE TABLE gu_1 (x real);
+CREATE TABLE gu_2 (x real);
+CREATE TABLE gu_3 (x numeric);
+INSERT INTO gu_1 (x) VALUES (1);
+INSERT INTO gu_2 (x) SELECT gs FROM generate_series(1,1000) AS gs;
+INSERT INTO gu_3 VALUES ('1.'), ('1.0'), ('1.00'), ('1.000'), ('1.0000');
+VACUUM ANALYZE gu_1,gu_2,gu_3;
+SELECT * FROM check_estimated_rows('
+  SELECT * FROM gu_1 LEFT JOIN (
+    SELECT x FROM gu_2 GROUP BY x) AS q1
+    ON gu_1.x=q1.x;');
+-- Special case when GROUP BY comparison operator differs from the upper one
+SELECT * FROM check_estimated_rows('
+SELECT * FROM gu_1 LEFT JOIN (
+select x::real from (SELECT x::text FROM gu_3) group by x) AS q1
+  ON gu_1.x=q1.x;');
+DROP TABLE gu_1,gu_2,gu_3;
+
 -- Permission tests. Users should not be able to see specific data values in
 -- the extended statistics, if they lack permission to see those values in
 -- the underlying table.
-- 
2.46.1