From 6e41a1b2a175f7e9a859429e57c2ffb17ec9051d Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Mon, 31 Mar 2025 14:53:11 -0500 Subject: [PATCH v12n 2/3] pg_dump: Reduce memory usage of dumps with statistics. Right now, pg_dump stores all generated commands for statistics in memory. These commands can be quite large and therefore can significantly increase pg_dump's memory footprint. To fix, wait until we are about to write out the commands before generating them, and be sure to free the commands after writing. This is implemented via a new defnDumper callback that works much like the dataDumper one but is specially designed for TOC entries. One drawback of this change is that custom dumps that include data will run the statistics queries twice. However, a follow-up commit will add batching for these queries that our testing indicates should greatly improve dump speed (even when compared to pg_dump without this commit). Author: Corey Huinker Discussion: https://postgr.es/m/CADkLM%3Dc%2Br05srPy9w%2B-%2BnbmLEo15dKXYQ03Q_xyK%2BriJerigLQ%40mail.gmail.com --- src/bin/pg_dump/pg_backup.h | 1 + src/bin/pg_dump/pg_backup_archiver.c | 35 ++++++++++++++++++---- src/bin/pg_dump/pg_backup_archiver.h | 5 ++++ src/bin/pg_dump/pg_dump.c | 45 ++++++++++++++++++++-------- 4 files changed, 69 insertions(+), 17 deletions(-) diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index 658986de6f8..781f8fa1cc9 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -285,6 +285,7 @@ typedef int DumpId; * Function pointer prototypes for assorted callback methods. */ +typedef char *(*DefnDumperPtr) (Archive *AH, const void *userArg); typedef int (*DataDumperPtr) (Archive *AH, const void *userArg); typedef void (*SetupWorkerPtrType) (Archive *AH); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 1d131e5a57d..4b73749b4e4 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -1266,6 +1266,9 @@ ArchiveEntry(Archive *AHX, CatalogId catalogId, DumpId dumpId, newToc->dataDumperArg = opts->dumpArg; newToc->hadDumper = opts->dumpFn ? true : false; + newToc->defnDumper = opts->defnFn; + newToc->defnDumperArg = opts->defnArg; + newToc->formatData = NULL; newToc->dataLength = 0; @@ -2621,7 +2624,17 @@ WriteToc(ArchiveHandle *AH) WriteStr(AH, te->tag); WriteStr(AH, te->desc); WriteInt(AH, te->section); - WriteStr(AH, te->defn); + + if (te->defnDumper) + { + char *defn = te->defnDumper((Archive *) AH, te->defnDumperArg); + + WriteStr(AH, defn); + pg_free(defn); + } + else + WriteStr(AH, te->defn); + WriteStr(AH, te->dropStmt); WriteStr(AH, te->copyStmt); WriteStr(AH, te->namespace); @@ -3849,7 +3862,7 @@ _printTocEntry(ArchiveHandle *AH, TocEntry *te, const char *pfx) /* * Actually print the definition. Normally we can just print the defn - * string if any, but we have three special cases: + * string if any, but we have four special cases: * * 1. A crude hack for suppressing AUTHORIZATION clause that old pg_dump * versions put into CREATE SCHEMA. Don't mutate the variant for schema @@ -3862,6 +3875,10 @@ _printTocEntry(ArchiveHandle *AH, TocEntry *te, const char *pfx) * 3. ACL LARGE OBJECTS entries need special processing because they * contain only one copy of the ACL GRANT/REVOKE commands, which we must * apply to each large object listed in the associated BLOB METADATA. + * + * 4. Entries with a defnDumper need to call it to generate the + * definition. This is primarily intended to provide a way to save memory + * for objects that need a lot of it (e.g., statistics data). */ if (ropt->noOwner && strcmp(te->desc, "SCHEMA") == 0 && strncmp(te->defn, "--", 2) != 0) @@ -3877,9 +3894,14 @@ _printTocEntry(ArchiveHandle *AH, TocEntry *te, const char *pfx) { IssueACLPerBlob(AH, te); } - else if (te->defn && strlen(te->defn) > 0) + else if (te->defnDumper || (te->defn && strlen(te->defn) > 0)) { - ahprintf(AH, "%s\n\n", te->defn); + char *defn = te->defn; + + if (te->defnDumper) + defn = te->defnDumper((Archive *) AH, te->defnDumperArg); + + ahprintf(AH, "%s\n\n", defn); /* * If the defn string contains multiple SQL commands, txn_size mode @@ -3892,7 +3914,7 @@ _printTocEntry(ArchiveHandle *AH, TocEntry *te, const char *pfx) strcmp(te->desc, "FUNCTION") != 0 && strcmp(te->desc, "PROCEDURE") != 0) { - const char *p = te->defn; + const char *p = defn; int nsemis = 0; while ((p = strchr(p, ';')) != NULL) @@ -3903,6 +3925,9 @@ _printTocEntry(ArchiveHandle *AH, TocEntry *te, const char *pfx) if (nsemis > 1) AH->txnCount += nsemis - 1; } + + if (te->defnDumper) + pg_free(defn); } /* diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index a2064f471ed..fc65e0e34d3 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -368,6 +368,9 @@ struct _tocEntry const void *dataDumperArg; /* Arg for above routine */ void *formatData; /* TOC Entry data specific to file format */ + DefnDumperPtr defnDumper; /* Routine to dump create statement */ + const void *defnDumperArg; /* Arg for above routine */ + /* working state while dumping/restoring */ pgoff_t dataLength; /* item's data size; 0 if none or unknown */ int reqs; /* do we need schema and/or data of object @@ -407,6 +410,8 @@ typedef struct _archiveOpts int nDeps; DataDumperPtr dumpFn; const void *dumpArg; + DefnDumperPtr defnFn; + const void *defnArg; } ArchiveOpts; #define ARCHIVE_OPTS(...) &(ArchiveOpts){__VA_ARGS__} /* Called to add a TOC entry */ diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 4ca34be230c..9fa2cb0672e 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -10560,13 +10560,17 @@ appendNamedArgument(PQExpBuffer out, Archive *fout, const char *argname, } /* - * dumpRelationStats -- + * dumpRelationStats_dumper -- + * + * Generate command to import stats into the relation on the new database. * - * Dump command to import stats into the relation on the new database. + * This routine is called by the Archiver when it wants the statistics to be + * dumped. */ -static void -dumpRelationStats(Archive *fout, const RelStatsInfo *rsinfo) +static char * +dumpRelationStats_dumper(Archive *fout, const void *userArg) { + const RelStatsInfo *rsinfo = (RelStatsInfo *) userArg; const DumpableObject *dobj = &rsinfo->dobj; PGresult *res; PQExpBuffer query; @@ -10586,10 +10590,7 @@ dumpRelationStats(Archive *fout, const RelStatsInfo *rsinfo) int i_range_length_histogram; int i_range_empty_frac; int i_range_bounds_histogram; - - /* nothing to do if we are not dumping statistics */ - if (!fout->dopt->dumpStatistics) - return; + char *ret; query = createPQExpBuffer(); if (!fout->is_prepared[PREPQUERY_GETATTRIBUTESTATS]) @@ -10770,17 +10771,37 @@ dumpRelationStats(Archive *fout, const RelStatsInfo *rsinfo) PQclear(res); + destroyPQExpBuffer(query); + ret = out->data; + pg_free(out); + return ret; +} + +/* + * dumpRelationStats -- + * + * Make an ArchiveEntry for the relation statistics. The Archiver will take + * care of gathering the statistics and generating the restore commands when + * they are needed. + */ +static void +dumpRelationStats(Archive *fout, const RelStatsInfo *rsinfo) +{ + const DumpableObject *dobj = &rsinfo->dobj; + + /* nothing to do if we are not dumping statistics */ + if (!fout->dopt->dumpStatistics) + return; + ArchiveEntry(fout, nilCatalogId, createDumpId(), ARCHIVE_OPTS(.tag = dobj->name, .namespace = dobj->namespace->dobj.name, .description = "STATISTICS DATA", .section = rsinfo->section, - .createStmt = out->data, + .defnFn = dumpRelationStats_dumper, + .defnArg = rsinfo, .deps = dobj->dependencies, .nDeps = dobj->nDeps)); - - destroyPQExpBuffer(out); - destroyPQExpBuffer(query); } /* -- 2.39.5 (Apple Git-154)