From caf8eb1706994029308bfd29c8bfce5de17dbff1 Mon Sep 17 00:00:00 2001
From: "Andrei V. Lepikhov" <lepihov@gmail.com>
Date: Mon, 21 Oct 2024 16:17:01 +0400
Subject: [PATCH 4/4] Add GROUP-BY strategy: put the most distinct column at
 the head.

Let's allow GROUP-BY to utilize cost_sort feature which can differentiate
orders of pathkeys lists according to the ndistinct of the first column.
---
 src/backend/optimizer/path/pathkeys.c    | 73 +++++++++++++++++++++++-
 src/test/regress/expected/aggregates.out | 43 +++++++-------
 src/test/regress/sql/aggregates.sql      | 10 ++--
 3 files changed, 96 insertions(+), 30 deletions(-)

diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index e25798972f..7db855fc39 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -26,6 +26,7 @@
 #include "optimizer/paths.h"
 #include "partitioning/partbounds.h"
 #include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
 
 /* Consider reordering of GROUP BY keys? */
 bool		enable_group_by_reordering = true;
@@ -471,6 +472,10 @@ get_useful_group_keys_orderings(PlannerInfo *root, Path *path)
 	List	   *pathkeys = root->group_pathkeys;
 	List	   *clauses = root->processed_groupClause;
 
+	double		nd_max = 0;
+	PathKey	   *pk_opt = NULL;
+	ListCell   *lc1, *lc2;
+
 	/* always return at least the original pathkeys/clauses */
 	info = makeNode(GroupByOrdering);
 	info->pathkeys = pathkeys;
@@ -524,9 +529,6 @@ get_useful_group_keys_orderings(PlannerInfo *root, Path *path)
 		/* Test consistency of info structures */
 		for_each_from(lc, infos, 1)
 		{
-			ListCell   *lc1,
-					   *lc2;
-
 			info = lfirst_node(GroupByOrdering, lc);
 
 			Assert(list_length(info->clauses) == list_length(pinfo->clauses));
@@ -544,6 +546,71 @@ get_useful_group_keys_orderings(PlannerInfo *root, Path *path)
 		}
 	}
 #endif
+
+	/*
+	 * Let's try the order with the column having max ndistinct value
+	 */
+
+	forboth(lc1, root->group_pathkeys, lc2, root->processed_groupClause)
+	{
+		PathKey			   *pkey = lfirst_node(PathKey, lc1);
+		SortGroupClause	   *gc = (SortGroupClause *) lfirst(lc2);
+		Node			   *node;
+		Bitmapset		   *relids;
+		VariableStatData	vardata;
+		double				nd = -1;
+		bool				isdefault;
+
+		if (foreach_current_index(lc1) >= root->num_groupby_pathkeys)
+			break;
+
+		node = get_sortgroupclause_expr(gc, root->parse->targetList);
+		relids = pull_varnos(root, node);
+
+		if (bms_num_members(relids) != 1 && bms_is_member(0, relids))
+			/*
+			 *Although functional index can estimate distincts here, the chance
+			 * is too low.
+			 */
+			continue;
+
+		examine_variable(root, node, 0, &vardata);
+		if (!HeapTupleIsValid(vardata.statsTuple))
+			continue;
+		nd = get_variable_numdistinct(&vardata, &isdefault);
+		ReleaseVariableStats(vardata);
+		if (isdefault)
+			continue;
+
+		Assert(nd >= 0);
+		if (nd_max == 0 || nd > nd_max)
+		{
+			nd_max = nd;
+			pk_opt = pkey;
+		}
+	}
+
+	if (pk_opt != NULL)
+	{
+		List   *new_pathkeys = list_make1(pk_opt);
+		int		n;
+
+		new_pathkeys = list_concat_unique_ptr(new_pathkeys, root->group_pathkeys);
+		n = group_keys_reorder_by_pathkeys(new_pathkeys, &pathkeys, &clauses,
+										   root->num_groupby_pathkeys);
+
+		if (n > 0 &&
+			(enable_incremental_sort || n == root->num_groupby_pathkeys) &&
+			compare_pathkeys(pathkeys, root->group_pathkeys) != PATHKEYS_EQUAL)
+		{
+			info = makeNode(GroupByOrdering);
+			info->pathkeys = pathkeys;
+			info->clauses = clauses;
+
+			infos = lappend(infos, info);
+		}
+	}
+
 	return infos;
 }
 
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 8c05adff86..4af82f9600 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2786,13 +2786,13 @@ SELECT balk(hundred) FROM tenk1;
 ROLLBACK;
 -- GROUP BY optimization by reordering GROUP BY clauses
 CREATE TABLE btg AS SELECT
-  i % 10 AS x,
-  i % 10 AS y,
-  'abc' || i % 10 AS z,
+  i % 231 AS x,
+  i % 49 AS y,
+  'abc' || i % 2 AS z,
   i AS w
-FROM generate_series(1, 100) AS i;
+FROM generate_series(1, 1000) AS i;
 CREATE INDEX btg_x_y_idx ON btg(x, y);
-ANALYZE btg;
+VACUUM ANALYZE btg;
 SET enable_hashagg = off;
 SET enable_seqscan = off;
 -- Utilize the ordering of index scan to avoid a Sort operation
@@ -2839,21 +2839,19 @@ EXPLAIN (COSTS OFF)
 SELECT count(*)
   FROM btg t1 JOIN btg t2 ON t1.z = t2.z AND t1.w = t2.w AND t1.x = t2.x
   GROUP BY t1.x, t1.y, t1.z, t1.w;
-                                  QUERY PLAN                                   
--------------------------------------------------------------------------------
+                           QUERY PLAN                           
+----------------------------------------------------------------
  GroupAggregate
-   Group Key: t1.x, t1.y, t1.z, t1.w
+   Group Key: t1.w, t1.x, t1.y, t1.z
    ->  Sort
-         Sort Key: t1.x, t1.y, t1.z, t1.w
+         Sort Key: t1.w, t1.x, t1.y, t1.z
          ->  Merge Join
-               Merge Cond: ((t1.w = t2.w) AND (t1.z = t2.z) AND (t1.x = t2.x))
-               ->  Sort
-                     Sort Key: t1.w, t1.z, t1.x
-                     ->  Index Scan using btg_x_y_idx on btg t1
-               ->  Sort
-                     Sort Key: t2.w, t2.z, t2.x
+               Merge Cond: (t1.x = t2.x)
+               Join Filter: ((t2.z = t1.z) AND (t2.w = t1.w))
+               ->  Index Scan using btg_x_y_idx on btg t1
+               ->  Materialize
                      ->  Index Scan using btg_x_y_idx on btg t2
-(12 rows)
+(10 rows)
 
 RESET enable_nestloop;
 RESET enable_hashjoin;
@@ -2877,11 +2875,12 @@ SELECT count(*) FROM btg GROUP BY w, x, y, z ORDER BY x*x, z;
  Sort
    Sort Key: ((x * x)), z
    ->  GroupAggregate
-         Group Key: w, x, y, z
-         ->  Sort
-               Sort Key: w, x, y, z
+         Group Key: x, y, w, z
+         ->  Incremental Sort
+               Sort Key: x, y, w, z
+               Presorted Key: x, y
                ->  Index Scan using btg_x_y_idx on btg
-(7 rows)
+(8 rows)
 
 -- Test the case where the number of incoming subtree path keys is more than
 -- the number of grouping keys.
@@ -2918,9 +2917,9 @@ GROUP BY c1.w, c1.z;
                      QUERY PLAN                      
 -----------------------------------------------------
  GroupAggregate
-   Group Key: c1.w, c1.z
+   Group Key: c1.z, c1.w
    ->  Sort
-         Sort Key: c1.w, c1.z, c1.x, c1.y
+         Sort Key: c1.z, c1.w, c1.x, c1.y
          ->  Merge Join
                Merge Cond: (c1.x = c2.x)
                ->  Sort
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index ca6d1bcfb7..f1a9a607b7 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -1199,13 +1199,13 @@ ROLLBACK;
 
 -- GROUP BY optimization by reordering GROUP BY clauses
 CREATE TABLE btg AS SELECT
-  i % 10 AS x,
-  i % 10 AS y,
-  'abc' || i % 10 AS z,
+  i % 231 AS x,
+  i % 49 AS y,
+  'abc' || i % 2 AS z,
   i AS w
-FROM generate_series(1, 100) AS i;
+FROM generate_series(1, 1000) AS i;
 CREATE INDEX btg_x_y_idx ON btg(x, y);
-ANALYZE btg;
+VACUUM ANALYZE btg;
 
 SET enable_hashagg = off;
 SET enable_seqscan = off;
-- 
2.47.0