From 22b572903b8c1a2459b051f91e0902a2fc243648 Mon Sep 17 00:00:00 2001 From: "Andrei V. Lepikhov" Date: Wed, 18 Sep 2024 19:25:02 +0200 Subject: [PATCH v2] Improve statistics estimation considering GROUP-BY as a 'uniqueiser'. This commit follows the idea of the 4767bc8ff2. GROUP-BY makes the grouped column 'highly likely unique'. We can employ this fact in the statistics to make more precise estimations in the upper query block. --- src/backend/utils/adt/selfuncs.c | 37 +++++++++++++++---------- src/include/utils/selfuncs.h | 2 +- src/test/regress/expected/stats_ext.out | 28 +++++++++++++++++++ src/test/regress/sql/stats_ext.sql | 19 +++++++++++++ 4 files changed, 71 insertions(+), 15 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 03d7fb5f48..b6ce8978c1 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -321,8 +321,8 @@ var_eq_const(VariableStatData *vardata, Oid oproid, Oid collation, } /* - * If we matched the var to a unique index or DISTINCT clause, assume - * there is exactly one match regardless of anything else. (This is + * If we matched the var to a unique index, DISTINCT or GROUP-BY clause, + * assume there is exactly one match regardless of anything else. (This is * slightly bogus, since the index or clause's equality operator might be * different from ours, but it's much more likely to be right than * ignoring the information.) @@ -483,8 +483,8 @@ var_eq_non_const(VariableStatData *vardata, Oid oproid, Oid collation, } /* - * If we matched the var to a unique index or DISTINCT clause, assume - * there is exactly one match regardless of anything else. (This is + * If we matched the var to a unique index, DISTINCT or GROUP-BY clause, + * assume there is exactly one match regardless of anything else. (This is * slightly bogus, since the index or clause's equality operator might be * different from ours, but it's much more likely to be right than * ignoring the information.) @@ -5005,9 +5005,9 @@ ReleaseDummy(HeapTuple tuple) * atttype, atttypmod: actual type/typmod of the "var" expression. This is * commonly the same as the exposed type of the variable argument, * but can be different in binary-compatible-type cases. - * isunique: true if we were able to match the var to a unique index or a - * single-column DISTINCT clause, implying its values are unique for - * this query. (Caution: this should be trusted for statistical + * isunique: true if we were able to match the var to a unique index, a + * single-column DISTINCT or GROUP-BY clause, implying its values are + * unique for this query. (Caution: this should be trusted for statistical * purposes only, since we do not check indimmediate nor verify that * the exact same definition of equality applies.) * acl_ok: true if current user has permission to read the column(s) @@ -5654,7 +5654,7 @@ examine_simple_variable(PlannerInfo *root, Var *var, Assert(IsA(subquery, Query)); /* - * Punt if subquery uses set operations or GROUP BY, as these will + * Punt if subquery uses set operations or grouping sets, as these will * mash underlying columns' stats beyond recognition. (Set ops are * particularly nasty; if we forged ahead, we would return stats * relevant to only the leftmost subselect...) DISTINCT is also @@ -5662,7 +5662,6 @@ examine_simple_variable(PlannerInfo *root, Var *var, * of learning something even with it. */ if (subquery->setOperations || - subquery->groupClause || subquery->groupingSets) return; @@ -5692,6 +5691,16 @@ examine_simple_variable(PlannerInfo *root, Var *var, return; } + /* The same idea as with DISTINCT clause works for a GROUP-BY too */ + if (subquery->groupClause) + { + if (list_length(subquery->groupClause) == 1 && + targetIsInSortList(ste, InvalidOid, subquery->groupClause)) + vardata->isunique = true; + /* cannot go further */ + return; + } + /* * If the sub-query originated from a view with the security_barrier * attribute, we must not look at the variable's statistics, though it @@ -5843,11 +5852,11 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault) } /* - * If there is a unique index or DISTINCT clause for the variable, assume - * it is unique no matter what pg_statistic says; the statistics could be - * out of date, or we might have found a partial unique index that proves - * the var is unique for this query. However, we'd better still believe - * the null-fraction statistic. + * If there is a unique index, DISTINCT or GROUP-BY clause for the variable, + * assume it is unique no matter what pg_statistic says; the statistics + * could be out of date, or we might have found a partial unique index that + * proves the var is unique for this query. However, we'd better still + * believe the null-fraction statistic. */ if (vardata->isunique) stadistinct = -1.0 * (1.0 - stanullfrac); diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index f2563ad1cb..b1ac81977b 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -94,7 +94,7 @@ typedef struct VariableStatData Oid vartype; /* exposed type of expression */ Oid atttype; /* actual type (after stripping relabel) */ int32 atttypmod; /* actual typmod (after stripping relabel) */ - bool isunique; /* matches unique index or DISTINCT clause */ + bool isunique; /* matches unique index, DISTINCT or GROUP-BY clause */ bool acl_ok; /* result of ACL check on table or column */ } VariableStatData; diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 8c4da95508..dfe9baf4b2 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -3074,6 +3074,34 @@ SELECT c0 FROM ONLY expr_stats_incompatible_test WHERE (0 rows) DROP TABLE expr_stats_incompatible_test; +-- Use the 'GROUP BY x' operator to claim 'x' as a unique column upstream. +CREATE TABLE gu_1 (x real); +CREATE TABLE gu_2 (x real); +CREATE TABLE gu_3 (x numeric); +INSERT INTO gu_1 (x) VALUES (1); +INSERT INTO gu_2 (x) SELECT gs FROM generate_series(1,1000) AS gs; +INSERT INTO gu_3 VALUES ('1.'), ('1.0'), ('1.00'), ('1.000'), ('1.0000'); +VACUUM ANALYZE gu_1,gu_2,gu_3; +SELECT * FROM check_estimated_rows(' + SELECT * FROM gu_1 LEFT JOIN ( + SELECT x FROM gu_2 GROUP BY x) AS q1 + ON gu_1.x=q1.x;'); + estimated | actual +-----------+-------- + 1 | 1 +(1 row) + +-- Special case when GROUP BY comparison operator differs from the upper one +SELECT * FROM check_estimated_rows(' +SELECT * FROM gu_1 LEFT JOIN ( +select x::real from (SELECT x::text FROM gu_3) group by x) AS q1 + ON gu_1.x=q1.x;'); + estimated | actual +-----------+-------- + 1 | 5 +(1 row) + +DROP TABLE gu_1,gu_2,gu_3; -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table. diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 0c08a6cc42..5e30007c37 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -1547,6 +1547,25 @@ SELECT c0 FROM ONLY expr_stats_incompatible_test WHERE DROP TABLE expr_stats_incompatible_test; +-- Use the 'GROUP BY x' operator to claim 'x' as a unique column upstream. +CREATE TABLE gu_1 (x real); +CREATE TABLE gu_2 (x real); +CREATE TABLE gu_3 (x numeric); +INSERT INTO gu_1 (x) VALUES (1); +INSERT INTO gu_2 (x) SELECT gs FROM generate_series(1,1000) AS gs; +INSERT INTO gu_3 VALUES ('1.'), ('1.0'), ('1.00'), ('1.000'), ('1.0000'); +VACUUM ANALYZE gu_1,gu_2,gu_3; +SELECT * FROM check_estimated_rows(' + SELECT * FROM gu_1 LEFT JOIN ( + SELECT x FROM gu_2 GROUP BY x) AS q1 + ON gu_1.x=q1.x;'); +-- Special case when GROUP BY comparison operator differs from the upper one +SELECT * FROM check_estimated_rows(' +SELECT * FROM gu_1 LEFT JOIN ( +select x::real from (SELECT x::text FROM gu_3) group by x) AS q1 + ON gu_1.x=q1.x;'); +DROP TABLE gu_1,gu_2,gu_3; + -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table. -- 2.46.1