Re: problem (bug?) with "in (subquery)"

From: Tom Lane <tgl(at)sss(dot)pgh(dot)pa(dot)us>
To: Luca Pireddu <lucap(at)shaw(dot)ca>
Cc: pgsql-sql(at)postgresql(dot)org, Michael Fuhr <mike(at)fuhr(dot)org>
Subject: Re: problem (bug?) with "in (subquery)"
Date: 2005-07-15 17:12:45
Message-ID: 16528.1121447565@sss.pgh.pa.us
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-sql

Luca Pireddu <lucap(at)shaw(dot)ca> writes:
> On July 15, 2005 08:58, Tom Lane wrote:
>> Ah-hah: this one is the fault of create_unique_path, which quoth

> In any case, it looks like Tom has already found the problem :-) Thanks guys!

On closer analysis, the test in create_unique_path is almost but not
quite completely wrong :-(. Here is the patch against 8.0 branch,
if you need it right away.

regards, tom lane

Index: src/backend/optimizer/util/pathnode.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v
retrieving revision 1.111
diff -c -r1.111 pathnode.c
*** src/backend/optimizer/util/pathnode.c 31 Dec 2004 22:00:23 -0000 1.111
--- src/backend/optimizer/util/pathnode.c 15 Jul 2005 17:03:06 -0000
***************
*** 34,40 ****
#include "utils/syscache.h"


! static bool is_distinct_query(Query *query);
static bool hash_safe_tlist(List *tlist);


--- 34,41 ----
#include "utils/syscache.h"


! static List *translate_sub_tlist(List *tlist, int relid);
! static bool query_is_distinct_for(Query *query, List *colnos);
static bool hash_safe_tlist(List *tlist);


***************
*** 642,655 ****
pathnode->subpath = subpath;

/*
! * If the input is a subquery whose output must be unique already, we
! * don't need to do anything.
*/
! if (rel->rtekind == RTE_SUBQUERY)
{
RangeTblEntry *rte = rt_fetch(rel->relid, root->rtable);

! if (is_distinct_query(rte->subquery))
{
pathnode->umethod = UNIQUE_PATH_NOOP;
pathnode->rows = rel->rows;
--- 643,683 ----
pathnode->subpath = subpath;

/*
! * Try to identify the targetlist that will actually be unique-ified.
! * In current usage, this routine is only used for sub-selects of IN
! * clauses, so we should be able to find the tlist in in_info_list.
! */
! sub_targetlist = NIL;
! foreach(l, root->in_info_list)
! {
! InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
!
! if (bms_equal(ininfo->righthand, rel->relids))
! {
! sub_targetlist = ininfo->sub_targetlist;
! break;
! }
! }
!
! /*
! * If the input is a subquery whose output must be unique already,
! * then we don't need to do anything. The test for uniqueness has
! * to consider exactly which columns we are extracting; for example
! * "SELECT DISTINCT x,y" doesn't guarantee that x alone is distinct.
! * So we cannot check for this optimization unless we found our own
! * targetlist above, and it consists only of simple Vars referencing
! * subquery outputs. (Possibly we could do something with expressions
! * in the subquery outputs, too, but for now keep it simple.)
*/
! if (sub_targetlist && rel->rtekind == RTE_SUBQUERY)
{
RangeTblEntry *rte = rt_fetch(rel->relid, root->rtable);
+ List *sub_tlist_colnos;

! sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid);
!
! if (sub_tlist_colnos &&
! query_is_distinct_for(rte->subquery, sub_tlist_colnos))
{
pathnode->umethod = UNIQUE_PATH_NOOP;
pathnode->rows = rel->rows;
***************
*** 664,686 ****
}

/*
- * Try to identify the targetlist that will actually be unique-ified.
- * In current usage, this routine is only used for sub-selects of IN
- * clauses, so we should be able to find the tlist in in_info_list.
- */
- sub_targetlist = NIL;
- foreach(l, root->in_info_list)
- {
- InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
-
- if (bms_equal(ininfo->righthand, rel->relids))
- {
- sub_targetlist = ininfo->sub_targetlist;
- break;
- }
- }
-
- /*
* If we know the targetlist, try to estimate number of result rows;
* otherwise punt.
*/
--- 692,697 ----
***************
*** 755,804 ****
}

/*
! * is_distinct_query - does query never return duplicate rows?
*/
! static bool
! is_distinct_query(Query *query)
{
! /* DISTINCT (but not DISTINCT ON) guarantees uniqueness */
! if (has_distinct_clause(query))
! return true;

! /* UNION, INTERSECT, EXCEPT guarantee uniqueness, except with ALL */
! if (query->setOperations)
{
! SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;

! Assert(IsA(topop, SetOperationStmt));
! Assert(topop->op != SETOP_NONE);

! if (!topop->all)
return true;
}

/*
! * GROUP BY guarantees uniqueness if all the grouped columns appear in
! * the output. In our implementation this means checking they are non
! * resjunk columns.
*/
if (query->groupClause)
{
! ListCell *gl;
!
! foreach(gl, query->groupClause)
{
! GroupClause *grpcl = (GroupClause *) lfirst(gl);
TargetEntry *tle = get_sortgroupclause_tle(grpcl,
query->targetList);

! if (tle->resdom->resjunk)
! break;
}
! if (!gl) /* got to the end? */
return true;
}

/*
* XXX Are there any other cases in which we can easily see the result
* must be distinct?
*/
--- 766,888 ----
}

/*
! * translate_sub_tlist - get subquery column numbers represented by tlist
! *
! * The given targetlist should contain only Vars referencing the given relid.
! * Extract their varattnos (ie, the column numbers of the subquery) and return
! * as an integer List.
! *
! * If any of the tlist items is not a simple Var, we cannot determine whether
! * the subquery's uniqueness condition (if any) matches ours, so punt and
! * return NIL.
*/
! static List *
! translate_sub_tlist(List *tlist, int relid)
{
! List *result = NIL;
! ListCell *l;

! foreach(l, tlist)
{
! Var *var = (Var *) lfirst(l);

! if (!var || !IsA(var, Var) ||
! var->varno != relid)
! return NIL; /* punt */

! result = lappend_int(result, var->varattno);
! }
! return result;
! }
!
! /*
! * query_is_distinct_for - does query never return duplicates of the
! * specified columns?
! *
! * colnos is an integer list of output column numbers (resno's). We are
! * interested in whether rows consisting of just these columns are certain
! * to be distinct.
! */
! static bool
! query_is_distinct_for(Query *query, List *colnos)
! {
! ListCell *l;
!
! /*
! * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
! * columns in the DISTINCT clause appear in colnos.
! */
! if (query->distinctClause)
! {
! foreach(l, query->distinctClause)
! {
! SortClause *scl = (SortClause *) lfirst(l);
! TargetEntry *tle = get_sortgroupclause_tle(scl,
! query->targetList);
!
! if (!list_member_int(colnos, tle->resdom->resno))
! break; /* exit early if no match */
! }
! if (l == NULL) /* had matches for all? */
return true;
}

/*
! * Similarly, GROUP BY guarantees uniqueness if all the grouped columns
! * appear in colnos.
*/
if (query->groupClause)
{
! foreach(l, query->groupClause)
{
! GroupClause *grpcl = (GroupClause *) lfirst(l);
TargetEntry *tle = get_sortgroupclause_tle(grpcl,
query->targetList);

! if (!list_member_int(colnos, tle->resdom->resno))
! break; /* exit early if no match */
}
! if (l == NULL) /* had matches for all? */
! return true;
! }
! else
! {
! /*
! * If we have no GROUP BY, but do have aggregates or HAVING, then
! * the result is at most one row so it's surely unique.
! */
! if (query->hasAggs || query->havingQual)
return true;
}

/*
+ * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
+ * except with ALL
+ */
+ if (query->setOperations)
+ {
+ SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+
+ Assert(IsA(topop, SetOperationStmt));
+ Assert(topop->op != SETOP_NONE);
+
+ if (!topop->all)
+ {
+ /* We're good if all the nonjunk output columns are in colnos */
+ foreach(l, query->targetList)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+ if (!tle->resdom->resjunk &&
+ !list_member_int(colnos, tle->resdom->resno))
+ break; /* exit early if no match */
+ }
+ if (l == NULL) /* had matches for all? */
+ return true;
+ }
+ }
+
+ /*
* XXX Are there any other cases in which we can easily see the result
* must be distinct?
*/
Index: src/test/regress/expected/subselect.out
===================================================================
RCS file: /cvsroot/pgsql/src/test/regress/expected/subselect.out,v
retrieving revision 1.10.4.2
diff -c -r1.10.4.2 subselect.out
*** src/test/regress/expected/subselect.out 1 Feb 2005 23:09:00 -0000 1.10.4.2
--- src/test/regress/expected/subselect.out 15 Jul 2005 17:03:06 -0000
***************
*** 203,208 ****
--- 203,265 ----
(1 row)

--
+ -- Test cases to check for overenthusiastic optimization of
+ -- "IN (SELECT DISTINCT ...)" and related cases. Per example from
+ -- Luca Pireddu and Michael Fuhr.
+ --
+ CREATE TEMP TABLE foo (id integer);
+ CREATE TEMP TABLE bar (id1 integer, id2 integer);
+ INSERT INTO foo VALUES (1);
+ INSERT INTO bar VALUES (1, 1);
+ INSERT INTO bar VALUES (2, 2);
+ INSERT INTO bar VALUES (3, 1);
+ -- These cases require an extra level of distinct-ing above subquery s
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+ id
+ ----
+ 1
+ (1 row)
+
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+ id
+ ----
+ 1
+ (1 row)
+
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+ SELECT id1, id2 FROM bar) AS s);
+ id
+ ----
+ 1
+ (1 row)
+
+ -- These cases do not
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+ id
+ ----
+ 1
+ (1 row)
+
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+ id
+ ----
+ 1
+ (1 row)
+
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT id2 FROM bar UNION
+ SELECT id2 FROM bar) AS s);
+ id
+ ----
+ 1
+ (1 row)
+
+ --
-- Test case to catch problems with multiply nested sub-SELECTs not getting
-- recalculated properly. Per bug report from Didier Moens.
--
Index: src/test/regress/sql/subselect.sql
===================================================================
RCS file: /cvsroot/pgsql/src/test/regress/sql/subselect.sql,v
retrieving revision 1.7
diff -c -r1.7 subselect.sql
*** src/test/regress/sql/subselect.sql 4 Oct 2004 14:42:48 -0000 1.7
--- src/test/regress/sql/subselect.sql 15 Jul 2005 17:03:06 -0000
***************
*** 96,101 ****
--- 96,134 ----
where unique1 IN (select distinct hundred from tenk1 b)) ss;

--
+ -- Test cases to check for overenthusiastic optimization of
+ -- "IN (SELECT DISTINCT ...)" and related cases. Per example from
+ -- Luca Pireddu and Michael Fuhr.
+ --
+
+ CREATE TEMP TABLE foo (id integer);
+ CREATE TEMP TABLE bar (id1 integer, id2 integer);
+
+ INSERT INTO foo VALUES (1);
+
+ INSERT INTO bar VALUES (1, 1);
+ INSERT INTO bar VALUES (2, 2);
+ INSERT INTO bar VALUES (3, 1);
+
+ -- These cases require an extra level of distinct-ing above subquery s
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+ SELECT id1, id2 FROM bar) AS s);
+
+ -- These cases do not
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+ SELECT * FROM foo WHERE id IN
+ (SELECT id2 FROM (SELECT id2 FROM bar UNION
+ SELECT id2 FROM bar) AS s);
+
+ --
-- Test case to catch problems with multiply nested sub-SELECTs not getting
-- recalculated properly. Per bug report from Didier Moens.
--

In response to

Browse pgsql-sql by date

  From Date Subject
Next Message Devrim GUNDUZ 2005-07-15 18:03:34 Re: Postgres for Fedora Core 2 OS ****************
Previous Message Dianne Yumul 2005-07-15 16:57:08 Re: Postgres for Fedora Core 2 OS ****************