diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0cc3296..a5c20b3 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1685,6 +1685,7 @@ SET ENABLE_SEQSCAN TO OFF; data corruption, after a system failure. The risks are similar to turning off fsync, though smaller, and it should be turned off only based on the same circumstances recommended for that parameter. + This parameter must be on when page_cksum is on. @@ -1701,6 +1702,20 @@ SET ENABLE_SEQSCAN TO OFF; + + page_cksum (boolean) + + page_cksumconfiguration parameter + + + + When this parameter is on, the + PostgreSQL server writes and + checks checksums for each page written to persistent storage. + + + + wal_buffers (integer) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 963189d..0fa5f68 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -314,6 +314,9 @@ extern char *optarg; extern int optind, opterr; +extern int page_checksum; +extern bool fullPageWrites; + #ifdef HAVE_INT_OPTRESET extern int optreset; /* might not be declared by system headers */ #endif @@ -766,6 +769,29 @@ PostmasterMain(int argc, char *argv[]) (errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"archive\" or \"hot_standby\""))); /* + * The idea here is that there will be checksum matches if there + * are partial writes to pages during hardware crashes. The user + * should have full_page_writes enabled if page_checksum is + * enabled so that these pages are automatically fixed, otherwise + * PostgreSQL may get checksum errors after crashes on pages that + * are in fact partially written and hence corrupted. With + * full_page_writes enabled, PostgreSQL will replace each page + * without ever looking at the partially-written page and seeing + * an incorrect checksum. Hence, checksums will detect only real + * disk corruptions, i.e. places where the disk reported a + * successful write but the data was still corrupted at some + * point. + * + * Alternatively, we may want to leave this check out. This would + * be for sophisticated users who have some other guarantee + * (hardware and/or software) against ever producing a partial + * write during crashes. + */ + if (page_checksum && !fullPageWrites) + ereport(ERROR, + (errmsg("full_page_writes must be enabled if page_checksum is enabled."))); + + /* * Other one-time internal sanity checks can go here, if they are fast. * (Put any slow processing further down, after postmaster.pid creation.) */ diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 4f607cd..1756d62 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -17,6 +17,7 @@ */ #include "postgres.h" +#include "catalog/catalog.h" #include "commands/tablespace.h" #include "storage/bufmgr.h" #include "storage/ipc.h" @@ -79,6 +80,12 @@ static const int NSmgr = lengthof(smgrsw); */ static HTAB *SMgrRelationHash = NULL; +/* Page checksumming. */ +static uint64 tempbuf[BLCKSZ/sizeof(uint64)]; +extern bool page_checksum; + +#define INVALID_CKSUM 0x1b0af034 + /* local function prototypes */ static void smgrshutdown(int code, Datum arg); @@ -381,6 +388,59 @@ smgrdounlink(SMgrRelation reln, ForkNumber forknum, bool isRedo) } /* + * The initial value when computing the checksum for a data page. + */ +static inline uint64 +ChecksumInit(SMgrRelation reln, ForkNumber f, BlockNumber b) +{ + return b + f; +} + +/* + * Compute a checksum of a buffer (with length len), using initial + * value cksum. We use a relatively simple checksum calculation to + * avoid overhead, but could replace with some kind of CRC + * calculation. + */ +static inline uint32 +ComputeChecksum(uint64 *buffer, uint32 len, uint64 cksum) +{ + int i; + + for (i = 0; i < len/sizeof(uint64); i += 4) { + cksum += (cksum << 5) + *buffer; + cksum += (cksum << 5) + *(buffer+1); + cksum += (cksum << 5) + *(buffer+2); + cksum += (cksum << 5) + *(buffer+3); + buffer += 4; + } + cksum = (cksum & 0xFFFFFFFF) + (cksum >> 32); + return cksum; +} + +/* + * Copy buffer to dst and compute the checksum during the copy (so + * that the checksum is correct for the final contents fo dst). + */ +static inline uint32 +CopyAndComputeChecksum(uint64 *dst, volatile uint64 *buffer, + uint32 len, uint64 cksum) +{ + int i; + + for (i = 0; i < len/sizeof(uint64); i += 4) { + cksum += (cksum << 5) + (*dst = *buffer); + cksum += (cksum << 5) + (*(dst+1) = *(buffer+1)); + cksum += (cksum << 5) + (*(dst+2) = *(buffer+2)); + cksum += (cksum << 5) + (*(dst+3) = *(buffer+3)); + dst += 4; + buffer += 4; + } + cksum = (cksum & 0xFFFFFFFF) + (cksum >> 32); + return cksum; +} + +/* * smgrextend() -- Add a new block to a file. * * The semantics are nearly the same as smgrwrite(): write at the @@ -393,8 +453,25 @@ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { + PageHeader p; + Assert(PageGetPageLayoutVersion(((PageHeader)buffer)) == PG_PAGE_LAYOUT_VERSION || + PageIsNew(buffer)); + if (page_checksum) { + p = (PageHeader)tempbuf; + ((PageHeader)buffer)->cksum = 0; + /* + * We copy and compute the checksum, and then write out the + * data from the copy to avoid any problem with hint bits + * changing after we compute the checksum. + */ + p->cksum = CopyAndComputeChecksum(tempbuf, (uint64 *)buffer, BLCKSZ, + ChecksumInit(reln, forknum, blocknum)); + } else { + p = (PageHeader)buffer; + p->cksum = INVALID_CKSUM; + } (*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum, - buffer, skipFsync); + (char *)p, skipFsync); } /* @@ -418,7 +495,29 @@ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { + PageHeader p = (PageHeader) buffer; (*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer); + Assert(PageIsNew(p) || PageGetPageLayoutVersion(p) == PG_PAGE_LAYOUT_VERSION); + if (page_checksum && p->cksum != INVALID_CKSUM) { + const uint32 diskCksum = p->cksum; + uint32 cksum; + + p->cksum = 0; + cksum = ComputeChecksum((uint64 *)buffer, BLCKSZ, + ChecksumInit(reln, forknum, blocknum)); + + if (cksum != diskCksum) { + ereport(PANIC, (0, errmsg("checksum mismatch: disk has %#x, should be %#x\n" + "filename %s, BlockNum %u, block specifier %d/%d/%d/%d/%u", + diskCksum, (uint32)cksum, + relpath(reln->smgr_rnode, forknum), + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, blocknum))); + } + } } /* @@ -440,8 +539,25 @@ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { + PageHeader p; + + if (page_checksum) { + p = (PageHeader)tempbuf; + ((PageHeader)buffer)->cksum = 0; + /* + * We copy and compute the checksum, then write out the data + * from the copy so that we avoid any problem with hint bits + * changing after we compute the checksum. + */ + p->cksum = CopyAndComputeChecksum(tempbuf, (uint64 *)buffer, BLCKSZ, + ChecksumInit(reln, forknum, blocknum)); + } else { + p = (PageHeader)buffer; + p->cksum = INVALID_CKSUM; + } + Assert(PageGetPageLayoutVersion(p) == PG_PAGE_LAYOUT_VERSION); (*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum, - buffer, skipFsync); + (char *)p, skipFsync); } /* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index da7b6d4..332b960 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -419,6 +419,7 @@ bool default_with_oids = false; bool SQL_inheritance = true; bool Password_encryption = true; +bool page_checksum = true; int log_min_error_statement = ERROR; int log_min_messages = WARNING; @@ -1438,6 +1439,14 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"page_checksum", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Enable disk page checksums."), + NULL, + }, + &page_checksum, true, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 315db46..6a107b9 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -167,6 +167,8 @@ # fsync_writethrough # open_sync #full_page_writes = on # recover from partial page writes +#page_cksum = on # checksum disk pages + #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers # (change requires restart) #wal_writer_delay = 200ms # 1-10000 milliseconds diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 14e177d..05ae537 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201112071 +#define CATALOG_VERSION_NO 201112141 #endif diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 42d6b10..847f157 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -132,6 +132,7 @@ typedef struct PageHeaderData LocationIndex pd_special; /* offset to start of special space */ uint16 pd_pagesize_version; TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */ + uint32 cksum; /* page checksum */ ItemIdData pd_linp[1]; /* beginning of line pointer array */ } PageHeaderData; @@ -165,8 +166,9 @@ typedef PageHeaderData *PageHeader; * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and * added the pd_flags field (by stealing some bits from pd_tli), * as well as adding the pd_prune_xid field (which enlarges the header). + * Release 9.2 uses 5; we added checksums to heap, index and fsm files. */ -#define PG_PAGE_LAYOUT_VERSION 4 +#define PG_PAGE_LAYOUT_VERSION 5 /* ----------------------------------------------------------------