From caf8eb1706994029308bfd29c8bfce5de17dbff1 Mon Sep 17 00:00:00 2001 From: "Andrei V. Lepikhov" Date: Mon, 21 Oct 2024 16:17:01 +0400 Subject: [PATCH 4/4] Add GROUP-BY strategy: put the most distinct column at the head. Let's allow GROUP-BY to utilize cost_sort feature which can differentiate orders of pathkeys lists according to the ndistinct of the first column. --- src/backend/optimizer/path/pathkeys.c | 73 +++++++++++++++++++++++- src/test/regress/expected/aggregates.out | 43 +++++++------- src/test/regress/sql/aggregates.sql | 10 ++-- 3 files changed, 96 insertions(+), 30 deletions(-) diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index e25798972f..7db855fc39 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -26,6 +26,7 @@ #include "optimizer/paths.h" #include "partitioning/partbounds.h" #include "utils/lsyscache.h" +#include "utils/selfuncs.h" /* Consider reordering of GROUP BY keys? */ bool enable_group_by_reordering = true; @@ -471,6 +472,10 @@ get_useful_group_keys_orderings(PlannerInfo *root, Path *path) List *pathkeys = root->group_pathkeys; List *clauses = root->processed_groupClause; + double nd_max = 0; + PathKey *pk_opt = NULL; + ListCell *lc1, *lc2; + /* always return at least the original pathkeys/clauses */ info = makeNode(GroupByOrdering); info->pathkeys = pathkeys; @@ -524,9 +529,6 @@ get_useful_group_keys_orderings(PlannerInfo *root, Path *path) /* Test consistency of info structures */ for_each_from(lc, infos, 1) { - ListCell *lc1, - *lc2; - info = lfirst_node(GroupByOrdering, lc); Assert(list_length(info->clauses) == list_length(pinfo->clauses)); @@ -544,6 +546,71 @@ get_useful_group_keys_orderings(PlannerInfo *root, Path *path) } } #endif + + /* + * Let's try the order with the column having max ndistinct value + */ + + forboth(lc1, root->group_pathkeys, lc2, root->processed_groupClause) + { + PathKey *pkey = lfirst_node(PathKey, lc1); + SortGroupClause *gc = (SortGroupClause *) lfirst(lc2); + Node *node; + Bitmapset *relids; + VariableStatData vardata; + double nd = -1; + bool isdefault; + + if (foreach_current_index(lc1) >= root->num_groupby_pathkeys) + break; + + node = get_sortgroupclause_expr(gc, root->parse->targetList); + relids = pull_varnos(root, node); + + if (bms_num_members(relids) != 1 && bms_is_member(0, relids)) + /* + *Although functional index can estimate distincts here, the chance + * is too low. + */ + continue; + + examine_variable(root, node, 0, &vardata); + if (!HeapTupleIsValid(vardata.statsTuple)) + continue; + nd = get_variable_numdistinct(&vardata, &isdefault); + ReleaseVariableStats(vardata); + if (isdefault) + continue; + + Assert(nd >= 0); + if (nd_max == 0 || nd > nd_max) + { + nd_max = nd; + pk_opt = pkey; + } + } + + if (pk_opt != NULL) + { + List *new_pathkeys = list_make1(pk_opt); + int n; + + new_pathkeys = list_concat_unique_ptr(new_pathkeys, root->group_pathkeys); + n = group_keys_reorder_by_pathkeys(new_pathkeys, &pathkeys, &clauses, + root->num_groupby_pathkeys); + + if (n > 0 && + (enable_incremental_sort || n == root->num_groupby_pathkeys) && + compare_pathkeys(pathkeys, root->group_pathkeys) != PATHKEYS_EQUAL) + { + info = makeNode(GroupByOrdering); + info->pathkeys = pathkeys; + info->clauses = clauses; + + infos = lappend(infos, info); + } + } + return infos; } diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 8c05adff86..4af82f9600 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -2786,13 +2786,13 @@ SELECT balk(hundred) FROM tenk1; ROLLBACK; -- GROUP BY optimization by reordering GROUP BY clauses CREATE TABLE btg AS SELECT - i % 10 AS x, - i % 10 AS y, - 'abc' || i % 10 AS z, + i % 231 AS x, + i % 49 AS y, + 'abc' || i % 2 AS z, i AS w -FROM generate_series(1, 100) AS i; +FROM generate_series(1, 1000) AS i; CREATE INDEX btg_x_y_idx ON btg(x, y); -ANALYZE btg; +VACUUM ANALYZE btg; SET enable_hashagg = off; SET enable_seqscan = off; -- Utilize the ordering of index scan to avoid a Sort operation @@ -2839,21 +2839,19 @@ EXPLAIN (COSTS OFF) SELECT count(*) FROM btg t1 JOIN btg t2 ON t1.z = t2.z AND t1.w = t2.w AND t1.x = t2.x GROUP BY t1.x, t1.y, t1.z, t1.w; - QUERY PLAN -------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------- GroupAggregate - Group Key: t1.x, t1.y, t1.z, t1.w + Group Key: t1.w, t1.x, t1.y, t1.z -> Sort - Sort Key: t1.x, t1.y, t1.z, t1.w + Sort Key: t1.w, t1.x, t1.y, t1.z -> Merge Join - Merge Cond: ((t1.w = t2.w) AND (t1.z = t2.z) AND (t1.x = t2.x)) - -> Sort - Sort Key: t1.w, t1.z, t1.x - -> Index Scan using btg_x_y_idx on btg t1 - -> Sort - Sort Key: t2.w, t2.z, t2.x + Merge Cond: (t1.x = t2.x) + Join Filter: ((t2.z = t1.z) AND (t2.w = t1.w)) + -> Index Scan using btg_x_y_idx on btg t1 + -> Materialize -> Index Scan using btg_x_y_idx on btg t2 -(12 rows) +(10 rows) RESET enable_nestloop; RESET enable_hashjoin; @@ -2877,11 +2875,12 @@ SELECT count(*) FROM btg GROUP BY w, x, y, z ORDER BY x*x, z; Sort Sort Key: ((x * x)), z -> GroupAggregate - Group Key: w, x, y, z - -> Sort - Sort Key: w, x, y, z + Group Key: x, y, w, z + -> Incremental Sort + Sort Key: x, y, w, z + Presorted Key: x, y -> Index Scan using btg_x_y_idx on btg -(7 rows) +(8 rows) -- Test the case where the number of incoming subtree path keys is more than -- the number of grouping keys. @@ -2918,9 +2917,9 @@ GROUP BY c1.w, c1.z; QUERY PLAN ----------------------------------------------------- GroupAggregate - Group Key: c1.w, c1.z + Group Key: c1.z, c1.w -> Sort - Sort Key: c1.w, c1.z, c1.x, c1.y + Sort Key: c1.z, c1.w, c1.x, c1.y -> Merge Join Merge Cond: (c1.x = c2.x) -> Sort diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index ca6d1bcfb7..f1a9a607b7 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -1199,13 +1199,13 @@ ROLLBACK; -- GROUP BY optimization by reordering GROUP BY clauses CREATE TABLE btg AS SELECT - i % 10 AS x, - i % 10 AS y, - 'abc' || i % 10 AS z, + i % 231 AS x, + i % 49 AS y, + 'abc' || i % 2 AS z, i AS w -FROM generate_series(1, 100) AS i; +FROM generate_series(1, 1000) AS i; CREATE INDEX btg_x_y_idx ON btg(x, y); -ANALYZE btg; +VACUUM ANALYZE btg; SET enable_hashagg = off; SET enable_seqscan = off; -- 2.47.0