From 2b52c286dd8ed1c8cabaa18a579183461e47f3af Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 16 May 2023 19:05:44 -0400
Subject: [PATCH v6 2/5] Postpone adding pushed down ojrelid to a join's
 relids.

When we apply outer join identity 3 in the forward direction, don't
immediately add the now-lower join's relid to the relid set of the
join.  This represents the fact that values from the "C" relation
(the join's RHS) aren't going to be fully correct until we perform
the other, commuted join.

Per report from Andrey Lepikhov.  Thanks to Richard Guo for
investigation and testing.

Discussion: https://postgr.es/m/0b819232-4b50-f245-1c7d-c8c61bf41827@postgrespro.ru
---
 src/backend/optimizer/README              | 21 ++++++
 src/backend/optimizer/path/equivclass.c   | 20 +++---
 src/backend/optimizer/path/indxpath.c     |  2 +-
 src/backend/optimizer/path/joinrels.c     | 85 +++++++++++++++++++++--
 src/backend/optimizer/plan/analyzejoins.c | 13 ++--
 src/backend/optimizer/util/relnode.c      |  9 ++-
 src/include/optimizer/paths.h             |  4 +-
 7 files changed, 127 insertions(+), 27 deletions(-)

diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README
index 227278eb6c..d017f3555a 100644
--- a/src/backend/optimizer/README
+++ b/src/backend/optimizer/README
@@ -410,6 +410,8 @@ joins.  However, when building Vars representing the outputs of join
 relations, we need to ensure that their varnullingrels are set to
 values consistent with the syntactic join order, so that they will
 appear equal() to pre-existing Vars in the upper part of the query.
+(We also have to be careful about where we place qual clauses using
+such Vars, as described below.)
 
 Outer joins also complicate handling of subquery pull-up.  Consider
 
@@ -507,6 +509,25 @@ problem for join relation identification either, since whether a semijoin
 has been completed is again implicit in the set of base relations
 included in the join.
 
+As usual, outer join identity 3 complicates matters.  If we start with
+	(A leftjoin B on (Pab)) leftjoin C on (Pbc)
+then the parser will have marked any C Vars appearing above these joins
+with the RT index of the B/C join.  If we now transform to
+	A leftjoin (B leftjoin C on (Pbc)) on (Pab)
+then it would appear that a clause using only such Vars could be pushed
+down and applied as a filter clause (not a join clause) at the lower
+B/C join.  But *this might not give the right answer* since the clause
+might see a non-null value for the C Var that will be replaced by null
+once the A/B join is performed.  We handle this by saying that the
+pushed-down join hasn't completely performed the work of the B/C join
+and hence is not entitled to include that outer join relid in its
+relid set.  When we form the A/B join, both outer joins' relids will
+be added to its relid set, and then the upper clause will be applied
+at the correct join level.  (Note there is no problem when identity 3
+is applied in the other direction: if we started with the second form
+then upper C Vars are marked with both outer join relids, so they
+cannot drop below whichever join is applied second.)
+
 There is one additional complication for qual clause placement, which
 occurs when we have made multiple versions of an outer-join clause as
 described previously (that is, we have both "Pbc" and "Pb*c" forms of
diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c
index 9949d5b1d3..ecb1343d1a 100644
--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -1366,19 +1366,20 @@ generate_base_implied_equalities_broken(PlannerInfo *root,
  * commutative duplicates, i.e. if the algorithm selects "a.x = b.y" but
  * we already have "b.y = a.x", we return the existing clause.
  *
- * If we are considering an outer join, ojrelid is the associated OJ relid,
- * otherwise it's zero.
+ * If we are considering an outer join, sjinfo is the associated OJ info,
+ * otherwise it can be NULL.
  *
  * join_relids should always equal bms_union(outer_relids, inner_rel->relids)
- * plus ojrelid if that's not zero.  We could simplify this function's API by
- * computing it internally, but most callers have the value at hand anyway.
+ * plus whatever add_outer_joins_to_relids() would add.  We could simplify
+ * this function's API by computing it internally, but most callers have the
+ * value at hand anyway.
  */
 List *
 generate_join_implied_equalities(PlannerInfo *root,
 								 Relids join_relids,
 								 Relids outer_relids,
 								 RelOptInfo *inner_rel,
-								 Index ojrelid)
+								 SpecialJoinInfo *sjinfo)
 {
 	List	   *result = NIL;
 	Relids		inner_relids = inner_rel->relids;
@@ -1396,8 +1397,9 @@ generate_join_implied_equalities(PlannerInfo *root,
 		nominal_inner_relids = inner_rel->top_parent_relids;
 		/* ECs will be marked with the parent's relid, not the child's */
 		nominal_join_relids = bms_union(outer_relids, nominal_inner_relids);
-		if (ojrelid != 0)
-			nominal_join_relids = bms_add_member(nominal_join_relids, ojrelid);
+		nominal_join_relids = add_outer_joins_to_relids(root,
+														nominal_join_relids,
+														sjinfo);
 	}
 	else
 	{
@@ -1418,7 +1420,7 @@ generate_join_implied_equalities(PlannerInfo *root,
 	 * At inner joins, we can be smarter: only consider eclasses mentioning
 	 * both input rels.
 	 */
-	if (ojrelid != 0)
+	if (sjinfo && sjinfo->ojrelid != 0)
 		matching_ecs = get_eclass_indexes_for_relids(root, nominal_join_relids);
 	else
 		matching_ecs = get_common_eclass_indexes(root, nominal_inner_relids,
@@ -1467,7 +1469,7 @@ generate_join_implied_equalities(PlannerInfo *root,
  * generate_join_implied_equalities_for_ecs
  *	  As above, but consider only the listed ECs.
  *
- * For the sole current caller, we can assume ojrelid == 0, that is we are
+ * For the sole current caller, we can assume sjinfo == NULL, that is we are
  * not interested in outer-join filter clauses.  This might need to change
  * in future.
  */
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 9f4698f2a2..1436dbc2f2 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -3364,7 +3364,7 @@ check_index_predicates(PlannerInfo *root, RelOptInfo *rel)
 																   otherrels),
 														 otherrels,
 														 rel,
-														 0));
+														 NULL));
 
 	/*
 	 * Normally we remove quals that are implied by a partial index's
diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c
index c0d9c50270..262d3aec65 100644
--- a/src/backend/optimizer/path/joinrels.c
+++ b/src/backend/optimizer/path/joinrels.c
@@ -710,9 +710,8 @@ make_join_rel(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2)
 		return NULL;
 	}
 
-	/* If we have an outer join, add its RTI to form the canonical relids. */
-	if (sjinfo && sjinfo->ojrelid != 0)
-		joinrelids = bms_add_member(joinrelids, sjinfo->ojrelid);
+	/* Add outer join relid(s) to form the canonical relids. */
+	joinrelids = add_outer_joins_to_relids(root, joinrelids, sjinfo);
 
 	/* Swap rels if needed to match the join info. */
 	if (reversed)
@@ -776,6 +775,81 @@ make_join_rel(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2)
 	return joinrel;
 }
 
+/*
+ * add_outer_joins_to_relids
+ *	  Add relids to input_relids to represent any outer joins that will be
+ *	  calculated at this join.
+ *
+ * input_relids is the union of the relid sets of the two input relations.
+ * Note that we modify this in-place and return it; caller must bms_copy()
+ * it first, if a separate value is desired.
+ *
+ * sjinfo represents the join being performed.
+ */
+Relids
+add_outer_joins_to_relids(PlannerInfo *root, Relids input_relids,
+						  SpecialJoinInfo *sjinfo)
+{
+	/* Nothing to do if this isn't an outer join with an assigned relid. */
+	if (sjinfo == NULL || sjinfo->ojrelid == 0)
+		return input_relids;
+
+	/*
+	 * If it's not a left join, we have no rules that would permit executing
+	 * it in non-syntactic order, so just form the syntactic relid set.  (This
+	 * is just a quick-exit test; we'd come to the same conclusion anyway,
+	 * since its commute_below_l and commute_above_l sets must be empty.)
+	 */
+	if (sjinfo->jointype != JOIN_LEFT)
+		return bms_add_member(input_relids, sjinfo->ojrelid);
+
+	/*
+	 * Add the OJ relid unless this join has been pushed into the RHS of a
+	 * syntactically-lower left join per OJ identity 3.  (If it has, then we
+	 * cannot claim that its outputs represent the final state of its RHS.)
+	 */
+	if (bms_is_subset(sjinfo->commute_below_l, input_relids))
+		input_relids = bms_add_member(input_relids, sjinfo->ojrelid);
+
+	/*
+	 * Contrariwise, if we are now forming the final result of such a commuted
+	 * pair of OJs, it's time to add the relid(s) of the pushed-down join(s).
+	 * We can skip this if this join was never a candidate to be pushed up.
+	 */
+	if (sjinfo->commute_above_l)
+	{
+		ListCell   *lc;
+
+		/*
+		 * The current join could complete the nulling of more than one
+		 * pushed-down join, so we have to examine all the SpecialJoinInfos.
+		 * Because join_info_list was built in bottom-up order, it's
+		 * sufficient to traverse it once: an ojrelid we add in one loop
+		 * iteration would not have affected decisions of earlier iterations.
+		 */
+		foreach(lc, root->join_info_list)
+		{
+			SpecialJoinInfo *othersj = (SpecialJoinInfo *) lfirst(lc);
+
+			if (othersj == sjinfo ||
+				othersj->ojrelid == 0 || othersj->jointype != JOIN_LEFT)
+				continue;		/* definitely not interesting */
+
+			if (othersj->commute_below_l == NULL)
+				continue;		/* was never a candidate to be pushed down */
+
+			/* Add it if not already present but conditions now satisfied */
+			if (!bms_is_member(othersj->ojrelid, input_relids) &&
+				bms_is_subset(othersj->min_lefthand, input_relids) &&
+				bms_is_subset(othersj->min_righthand, input_relids) &&
+				bms_is_subset(othersj->commute_below_l, input_relids))
+				input_relids = bms_add_member(input_relids, othersj->ojrelid);
+		}
+	}
+
+	return input_relids;
+}
+
 /*
  * populate_joinrel_with_paths
  *	  Add paths to the given joinrel for given pair of joining relations. The
@@ -1535,9 +1609,8 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
 
 		/* Build correct join relids for child join */
 		child_joinrelids = bms_union(child_rel1->relids, child_rel2->relids);
-		if (child_sjinfo->ojrelid != 0)
-			child_joinrelids = bms_add_member(child_joinrelids,
-											  child_sjinfo->ojrelid);
+		child_joinrelids = add_outer_joins_to_relids(root, child_joinrelids,
+													 child_sjinfo);
 
 		/* Find the AppendRelInfo structures */
 		appinfos = find_appinfos_by_relids(root, child_joinrelids, &nappinfos);
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 1116ad978e..7463b3696a 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -88,8 +88,11 @@ restart:
 		 */
 		innerrelid = bms_singleton_member(sjinfo->min_righthand);
 
-		/* Compute the relid set for the join we are considering */
-		joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
+		/*
+		 * Compute the relid set for the join we are considering.  We can
+		 * assume things are done in syntactic order.
+		 */
+		joinrelids = bms_union(sjinfo->syn_lefthand, sjinfo->syn_righthand);
 		if (sjinfo->ojrelid != 0)
 			joinrelids = bms_add_member(joinrelids, sjinfo->ojrelid);
 
@@ -201,8 +204,8 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 	if (!rel_supports_distinctness(root, innerrel))
 		return false;
 
-	/* Compute the relid set for the join we are considering */
-	inputrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
+	/* Compute the syntactic relid set for the join we are considering */
+	inputrelids = bms_union(sjinfo->syn_lefthand, sjinfo->syn_righthand);
 	Assert(sjinfo->ojrelid != 0);
 	joinrelids = bms_copy(inputrelids);
 	joinrelids = bms_add_member(joinrelids, sjinfo->ojrelid);
@@ -666,7 +669,7 @@ reduce_unique_semijoins(PlannerInfo *root)
 														 joinrelids,
 														 sjinfo->min_lefthand,
 														 innerrel,
-														 0),
+														 NULL),
 						innerrel->joininfo);
 
 		/* Test whether the innerrel is unique for those clauses. */
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c
index 68fd033595..0d849d9494 100644
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -870,8 +870,7 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
 
 	joinrel->reloptkind = RELOPT_OTHER_JOINREL;
 	joinrel->relids = bms_union(outer_rel->relids, inner_rel->relids);
-	if (sjinfo->ojrelid != 0)
-		joinrel->relids = bms_add_member(joinrel->relids, sjinfo->ojrelid);
+	joinrel->relids = add_outer_joins_to_relids(root, joinrel->relids, sjinfo);
 	joinrel->rows = 0;
 	/* cheap startup cost is interesting iff not all tuples to be retrieved */
 	joinrel->consider_startup = (root->tuple_fraction > 0);
@@ -1259,7 +1258,7 @@ build_joinrel_restrictlist(PlannerInfo *root,
 														  joinrel->relids,
 														  outer_rel->relids,
 														  inner_rel,
-														  sjinfo->ojrelid));
+														  sjinfo));
 
 	return result;
 }
@@ -1543,7 +1542,7 @@ get_baserel_parampathinfo(PlannerInfo *root, RelOptInfo *baserel,
 															joinrelids,
 															required_outer,
 															baserel,
-															0));
+															NULL));
 
 	/* Compute set of serial numbers of the enforced clauses */
 	pserials = NULL;
@@ -1666,7 +1665,7 @@ get_joinrel_parampathinfo(PlannerInfo *root, RelOptInfo *joinrel,
 												join_and_req,
 												required_outer,
 												joinrel,
-												0);
+												NULL);
 	/* We only want ones that aren't movable to lower levels */
 	dropped_ecs = NIL;
 	foreach(lc, eclauses)
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 5b9db7733d..d9e1623274 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -104,6 +104,8 @@ extern void add_paths_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel,
 extern void join_search_one_level(PlannerInfo *root, int level);
 extern RelOptInfo *make_join_rel(PlannerInfo *root,
 								 RelOptInfo *rel1, RelOptInfo *rel2);
+extern Relids add_outer_joins_to_relids(PlannerInfo *root, Relids input_relids,
+										SpecialJoinInfo *sjinfo);
 extern bool have_join_order_restriction(PlannerInfo *root,
 										RelOptInfo *rel1, RelOptInfo *rel2);
 extern bool have_dangerous_phv(PlannerInfo *root,
@@ -150,7 +152,7 @@ extern List *generate_join_implied_equalities(PlannerInfo *root,
 											  Relids join_relids,
 											  Relids outer_relids,
 											  RelOptInfo *inner_rel,
-											  Index ojrelid);
+											  SpecialJoinInfo *sjinfo);
 extern List *generate_join_implied_equalities_for_ecs(PlannerInfo *root,
 													  List *eclasses,
 													  Relids join_relids,
-- 
2.31.1

