--- a/src/backend/utils/adt/selfuncs.c 2019-08-06 01:14:59.000000000 +0400 +++ b/src/backend/utils/adt/selfuncs.c 2019-09-06 08:23:34.932978863 +0400 @@ -161,7 +161,7 @@ static double var_eq_const(VariableStatData *vardata, Oid operator, Datum constval, bool constisnull, bool varonleft, bool negate); -static double var_eq_non_const(VariableStatData *vardata, Oid operator, +static double var_eq_non_const(VariableStatData *vardata, PlannerInfo *root, List *args, Oid operator, Node *other, bool varonleft, bool negate); static double ineq_histogram_selectivity(PlannerInfo *root, @@ -291,7 +291,7 @@ ((Const *) other)->constisnull, varonleft, negate); else - selec = var_eq_non_const(&vardata, operator, other, + selec = var_eq_non_const(&vardata, root, args, operator, other, /* root, args for detailed statistics of variable on the other hand */ varonleft, negate); ReleaseVariableStats(vardata); @@ -453,11 +453,161 @@ return selec; } +/* var_eq_var_mcvs - selectivity of "var = var2" using MCV lists + * based on eqjoinsel_inner() code + */ +static bool +var_eq_var_mcvs(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2, + AttStatsSlot *sslot1, double nullfrac1, + double nd1, double *selec) +{ + Oid opfuncoid; + Form_pg_statistic stats2 = NULL; + bool have_mcvs2 = false; + AttStatsSlot sslot2; + bool result = false; /* managed to use of the MCV lists */ + + opfuncoid = get_opcode(operator); + + memset(&sslot2, 0, sizeof(sslot2)); + + if (HeapTupleIsValid(vardata2->statsTuple)) + { + /* note we allow use of nullfrac regardless of security check */ + stats2 = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple); + if (statistic_proc_security_check(vardata2, opfuncoid)) + have_mcvs2 = get_attstatsslot(&sslot2, vardata2->statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); + } + + if (have_mcvs2 && stats2->stanullfrac < 1.0 && sslot2.nvalues > 0) /* we use MCV list for selectivity estimate if var2 has MCV statistics, has any not null value and MCV list is not empty */ + { + /* + * We have most-common-value lists for both relations. Run through + * the lists to see which MCVs actually join to each other with the + * given operator. This allows us to determine the exact join + * selectivity for the portion of the relations represented by the MCV + * lists. We still have to estimate for the remaining population, but + * in a skewed distribution this gives us a big leg up in accuracy. + * For motivation see the analysis in Y. Ioannidis and S. + * Christodoulakis, "On the propagation of errors in the size of join + * results", Technical Report 1018, Computer Science Dept., University + * of Wisconsin, Madison, March 1992 (available from ftp.cs.wisc.edu). + */ + FmgrInfo eqproc; + bool *hasmatch1; + bool *hasmatch2; + /*double nullfrac1 = stats1->stanullfrac;*/ + double nullfrac2 = stats2->stanullfrac; + double matchprodfreq, + matchfreq1, + matchfreq2, + unmatchfreq1, + unmatchfreq2, + otherfreq1, + otherfreq2, + totalsel2; + int i, + nmatches; + + fmgr_info(opfuncoid, &eqproc); + hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); + hasmatch2 = (bool *) palloc0(sslot2.nvalues * sizeof(bool)); + + /* + * Note we assume that each MCV will match at most one member of the + * other MCV list. If the operator isn't really equality, there could + * be multiple matches --- but we don't look for them, both for speed + * and because the math wouldn't add up... + */ + matchprodfreq = 0.0; + nmatches = 0; + for (i = 0; i < sslot1->nvalues; i++) + { + int j; + + for (j = 0; j < sslot2.nvalues; j++) + { + if (hasmatch2[j]) + continue; + if (DatumGetBool(FunctionCall2Coll(&eqproc, + DEFAULT_COLLATION_OID, + sslot1->values[i], + sslot2.values[j]))) + { + hasmatch1[i] = hasmatch2[j] = true; + matchprodfreq += sslot1->numbers[i] * sslot2.numbers[j]; + nmatches++; + break; + } + } + } + CLAMP_PROBABILITY(matchprodfreq); + /* Sum up frequencies of matched and unmatched MCVs */ + matchfreq1 = unmatchfreq1 = 0.0; + for (i = 0; i < sslot1->nvalues; i++) + { + if (hasmatch1[i]) + matchfreq1 += sslot1->numbers[i]; + else + unmatchfreq1 += sslot1->numbers[i]; + } + CLAMP_PROBABILITY(matchfreq1); + CLAMP_PROBABILITY(unmatchfreq1); + matchfreq2 = unmatchfreq2 = 0.0; + for (i = 0; i < sslot2.nvalues; i++) + { + if (hasmatch2[i]) + matchfreq2 += sslot2.numbers[i]; + else + unmatchfreq2 += sslot2.numbers[i]; + } + CLAMP_PROBABILITY(matchfreq2); + CLAMP_PROBABILITY(unmatchfreq2); + pfree(hasmatch1); + pfree(hasmatch2); + + /* + * Compute total frequency of non-null values that are not in the MCV + * lists. + */ + otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1; + otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2; + CLAMP_PROBABILITY(otherfreq1); + CLAMP_PROBABILITY(otherfreq2); + + /* + * We can estimate the total selectivity from the point of view of + * relation 1 as: the known selectivity for matched MCVs, plus + * unmatched MCVs that are assumed to match against random members of + * relation 2's non-MCV population, plus non-MCV values that are + * assumed to match against random members of relation 2's unmatched + * MCVs plus non-MCV values. + */ + /* Same estimate from the point of view of relation 2. */ + totalsel2 = matchprodfreq; + if (nd1 > sslot1->nvalues) + totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - sslot1->nvalues); + if (nd1 > nmatches) + totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / + (nd1 - nmatches); + + *selec = totalsel2/(1-nullfrac2); + result = true; + + } + + free_attstatsslot(&sslot2); + + return result; +} + /* * var_eq_non_const --- eqsel for var = something-other-than-const case */ static double -var_eq_non_const(VariableStatData *vardata, Oid operator, +var_eq_non_const(VariableStatData *vardata, PlannerInfo *root, List *args, Oid operator, Node *other, bool varonleft, bool negate) { @@ -513,8 +663,20 @@ */ if (get_attstatsslot(&sslot, vardata->statsTuple, STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | /* need MCV list */ ATTSTATSSLOT_NUMBERS)) { + if (sslot.nvalues > 0) /* only if MCV list is not empty */ + { + Node *node2; + VariableStatData vardata2; + + node2 = varonleft ? lsecond(args) : linitial(args); /* variable2 is on the other side */ + examine_variable(root, node2, 0, &vardata2); /* data of the other side */ + if (vardata2.rel) /* rel from the other side */ + var_eq_var_mcvs(operator, vardata, &vardata2, &sslot, nullfrac, ndistinct, &selec); /* trying to estimate the selectivity more accurate using the MCV lists */ + ReleaseVariableStats(vardata2); + } if (sslot.nnumbers > 0 && selec > sslot.numbers[0]) selec = sslot.numbers[0]; free_attstatsslot(&sslot);