diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 64eec91f8b..667a23eed0 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -1653,10 +1653,54 @@ ExecHashTableInsert(HashJoinTable hashtable, hashtable->spaceUsed += hashTupleSize; if (hashtable->spaceUsed > hashtable->spacePeak) hashtable->spacePeak = hashtable->spaceUsed; + + /* + * Consider increasing number of batches. + * + * Each batch requires a non-trivial amount of memory, because BufFile + * includes a PGAlignedBlock (typically 8kB buffer). So when doubling + * the number of batches, we need to be careful and only allow that if + * it actually has a chance of reducing memory usage. + * + * In particular, doubling the number of batches is pointless when + * + * (spaceUsed / 2) < (nbatches * sizeof(BufFile)) + * + * because we expect to save roughly 1/2 of memory currently used for + * data (rows) at the price of doubling the memory used for BufFile. + * + * We can't stop adding batches entirely, because that would just mean + * the batches would need more and more memory. So we need to increase + * the number of batches, even if we can't enforce work_mem properly. + * The goal is to minimize the overall memory usage of the hash join. + * + * Note: This applies mostly to cases of significant underestimates, + * resulting in an explosion of the number of batches. The properly + * estimated cases should generally end up using merge join based on + * high cost of the batched hash join. + */ if (hashtable->spaceUsed + - hashtable->nbuckets_optimal * sizeof(HashJoinTuple) + hashtable->nbuckets_optimal * sizeof(HashJoinTuple) + + hashtable->nbatch * sizeof(PGAlignedBlock) > hashtable->spaceAllowed) + { ExecHashIncreaseNumBatches(hashtable); + + /* + * Consider increasing the resize threshold. For well estimated cases + * this does nothing, because batches are expected to account only for + * small fraction of work_mem. 
But if we significantly underestimate + * the number of batches, we may end up in a situation where BufFiles + * alone exceed work_mem. So move the threshold a bit, until the next + * point where it'll make sense to consider adding batches again. */ + hashtable->spaceAllowed + = Max(hashtable->spaceAllowed, + hashtable->nbatch * sizeof(PGAlignedBlock) * 3); + + elog(WARNING, "ExecHashIncreaseNumBatches: nbatch=%d spaceAllowed=%ld", + hashtable->nbatch, hashtable->spaceAllowed); + } } else { @@ -2661,6 +2705,8 @@ ExecHashGetInstrumentation(HashInstrumentation *instrument, instrument->nbatch = hashtable->nbatch; instrument->nbatch_original = hashtable->nbatch_original; instrument->space_peak = hashtable->spacePeak; + /* account for memory used for BufFile */ + instrument->space_peak += hashtable->nbatch * sizeof(PGAlignedBlock); } /*