From cfb5973ac86aa32d12bedf472256ad357eaf5fec Mon Sep 17 00:00:00 2001 From: "Andrei V. Lepikhov" Date: Tue, 22 Oct 2024 16:54:49 +0400 Subject: [PATCH 3/4] Consider ndistinct on the first column in cost_sort(). --- src/backend/optimizer/path/costsize.c | 46 ++++++++++++++++++-- src/test/regress/expected/aggregates.out | 24 +++++----- src/test/regress/expected/partition_join.out | 18 ++++---- 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 8d50905ba8..8935185a5e 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -194,6 +194,8 @@ static double page_size(double tuples, int width); static double get_parallel_divisor(Path *path); static EquivalenceMember *identify_sort_ecmember(PlannerInfo *root, EquivalenceClass *ec); +static double sort_comparisons_factor(PlannerInfo *root, List *pathkeys, + double ntuples); /* @@ -2113,8 +2115,7 @@ cost_sort(Path *path, PlannerInfo *root, { Cost startup_cost; Cost run_cost; - double cmpfrac = - (pathkeys == NIL) ? 2.0 : list_length(pathkeys) + 1.0; + double cmpfrac = sort_comparisons_factor(root, pathkeys, tuples); cost_tuplesort(&startup_cost, &run_cost, tuples, width, cmpfrac, @@ -6560,4 +6561,43 @@ identify_sort_ecmember(PlannerInfo *root, EquivalenceClass *ec) Assert(candidate != NULL); return candidate; -} \ No newline at end of file +} + +/* + * Calculate a multiplier reflecting the number of comparisons which the + * executor has to perform during the sort with this specific order of columns. + * + * The comparison factor f = 1.+F(pathkeys). Here the 1. encapsulates the + * second-order effects which the cost function doesn't consider. + * F(pathkeys) is the estimated fraction of comparisons in the range [1..N]. + * F = 1 corresponds to the 'all-unique' first column case. In that case the sort + * will call the comparison function only once for each pair of tuples. 
+ * F = N represents the case, when values in all columns are constant. + */ +static double +sort_comparisons_factor(PlannerInfo *root, List *pathkeys, double ntuples) +{ + int n = list_length(pathkeys); + double cmpfrac = (n == 0) ? 2.0 : n + 1; /* default factor: 2.0 without pathkeys, otherwise n + 1 */ + + if (root != NULL && ntuples > 1 && n > 1) /* refine the default only when root is available and a multi-column sort covers more than one tuple; ntuples > 1 also keeps the divisor (ntuples - 1) below positive */ + { + PathKey *key = linitial_node(PathKey, pathkeys); + EquivalenceMember *em = identify_sort_ecmember(root, key->pk_eclass); /* only the first pathkey is examined */ + + Assert(em->em_ndistinct >= 0); + + if (em->em_ndistinct == 0.) + /* + * The optimizer has no ndistinct estimate here (em_ndistinct is zero): + * keep the default factor of n + 1. + */ + return cmpfrac; + + if (ntuples >= em->em_ndistinct) /* NOTE(review): if em_ndistinct exceeds ntuples the default n + 1 is kept, while em_ndistinct == ntuples gives 2.0 -- confirm this jump is intended */ + cmpfrac = + 2.0 + ((ntuples - em->em_ndistinct) / (ntuples - 1)) * (n - 1); /* 2.0 when em_ndistinct == ntuples, n + 1 when em_ndistinct == 1 */ + } + + return cmpfrac; +} diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 45e08457df..8c05adff86 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -2842,19 +2842,18 @@ SELECT count(*) QUERY PLAN ------------------------------------------------------------------------------- GroupAggregate - Group Key: t1.z, t1.w, t1.x, t1.y - -> Incremental Sort - Sort Key: t1.z, t1.w, t1.x, t1.y - Presorted Key: t1.z, t1.w, t1.x + Group Key: t1.x, t1.y, t1.z, t1.w + -> Sort + Sort Key: t1.x, t1.y, t1.z, t1.w -> Merge Join - Merge Cond: ((t1.z = t2.z) AND (t1.w = t2.w) AND (t1.x = t2.x)) + Merge Cond: ((t1.w = t2.w) AND (t1.z = t2.z) AND (t1.x = t2.x)) -> Sort - Sort Key: t1.z, t1.w, t1.x + Sort Key: t1.w, t1.z, t1.x -> Index Scan using btg_x_y_idx on btg t1 -> Sort - Sort Key: t2.z, t2.w, t2.x + Sort Key: t2.w, t2.z, t2.x -> Index Scan using btg_x_y_idx on btg t2 -(13 rows) +(12 rows) RESET enable_nestloop; RESET enable_hashjoin; @@ -2878,12 +2877,11 @@ SELECT count(*) FROM btg GROUP BY w, x, y, z ORDER BY x*x, z; Sort Sort Key: ((x * x)), z -> GroupAggregate - Group Key: x, y, w, z - -> Incremental Sort - Sort Key: x, y, w, z - Presorted Key: x, y + Group Key: w, x, y, z + -> 
Sort + Sort Key: w, x, y, z -> Index Scan using btg_x_y_idx on btg -(8 rows) +(7 rows) -- Test the case where the number of incoming subtree path keys is more than -- the number of grouping keys. diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index 6b0303ce52..1e6b278d21 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -615,39 +615,39 @@ SELECT a, b FROM prt1 FULL JOIN prt2 p2(b,a,c) USING(a,b) -> Sort Sort Key: (COALESCE(prt1.a, p2.a)), (COALESCE(prt1.b, p2.b)) -> Merge Full Join - Merge Cond: ((prt1.a = p2.a) AND (prt1.b = p2.b)) + Merge Cond: ((prt1.b = p2.b) AND (prt1.a = p2.a)) Filter: ((COALESCE(prt1.a, p2.a) >= 490) AND (COALESCE(prt1.a, p2.a) <= 510)) -> Sort - Sort Key: prt1.a, prt1.b + Sort Key: prt1.b, prt1.a -> Seq Scan on prt1_p1 prt1 -> Sort - Sort Key: p2.a, p2.b + Sort Key: p2.b, p2.a -> Seq Scan on prt2_p1 p2 -> Group Group Key: (COALESCE(prt1_1.a, p2_1.a)), (COALESCE(prt1_1.b, p2_1.b)) -> Sort Sort Key: (COALESCE(prt1_1.a, p2_1.a)), (COALESCE(prt1_1.b, p2_1.b)) -> Merge Full Join - Merge Cond: ((prt1_1.a = p2_1.a) AND (prt1_1.b = p2_1.b)) + Merge Cond: ((prt1_1.b = p2_1.b) AND (prt1_1.a = p2_1.a)) Filter: ((COALESCE(prt1_1.a, p2_1.a) >= 490) AND (COALESCE(prt1_1.a, p2_1.a) <= 510)) -> Sort - Sort Key: prt1_1.a, prt1_1.b + Sort Key: prt1_1.b, prt1_1.a -> Seq Scan on prt1_p2 prt1_1 -> Sort - Sort Key: p2_1.a, p2_1.b + Sort Key: p2_1.b, p2_1.a -> Seq Scan on prt2_p2 p2_1 -> Group Group Key: (COALESCE(prt1_2.a, p2_2.a)), (COALESCE(prt1_2.b, p2_2.b)) -> Sort Sort Key: (COALESCE(prt1_2.a, p2_2.a)), (COALESCE(prt1_2.b, p2_2.b)) -> Merge Full Join - Merge Cond: ((prt1_2.a = p2_2.a) AND (prt1_2.b = p2_2.b)) + Merge Cond: ((prt1_2.b = p2_2.b) AND (prt1_2.a = p2_2.a)) Filter: ((COALESCE(prt1_2.a, p2_2.a) >= 490) AND (COALESCE(prt1_2.a, p2_2.a) <= 510)) -> Sort - Sort Key: prt1_2.a, prt1_2.b + Sort Key: prt1_2.b, prt1_2.a -> Seq Scan on 
prt1_p3 prt1_2 -> Sort - Sort Key: p2_2.a, p2_2.b + Sort Key: p2_2.b, p2_2.a -> Seq Scan on prt2_p3 p2_2 (43 rows) -- 2.47.0