diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 0cc3296..a5c20b3 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1685,6 +1685,7 @@ SET ENABLE_SEQSCAN TO OFF;
data corruption, after a system failure. The risks are similar to turning off
fsync, though smaller, and it should be turned off
only based on the same circumstances recommended for that parameter.
+ This parameter must be on when page_checksum is on.
@@ -1701,6 +1702,20 @@ SET ENABLE_SEQSCAN TO OFF;
+
+ page_checksum (boolean)
+
+ page_checksum configuration parameter
+
+
+
+ When this parameter is on, the
+ PostgreSQL server writes and
+ checks checksums for each page written to persistent storage.
+
+
+
+
wal_buffers (integer)
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 963189d..0fa5f68 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -314,6 +314,9 @@ extern char *optarg;
extern int optind,
opterr;
+extern bool page_checksum;		/* GUC variable; defined as bool in guc.c */
+extern bool fullPageWrites;
+
#ifdef HAVE_INT_OPTRESET
extern int optreset; /* might not be declared by system headers */
#endif
@@ -766,6 +769,29 @@ PostmasterMain(int argc, char *argv[])
(errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"archive\" or \"hot_standby\"")));
/*
+ * The idea here is that there will be checksum mismatches if there
+ * are partial writes to pages during hardware crashes. The user
+ * should have full_page_writes enabled if page_checksum is
+ * enabled so that these pages are automatically fixed, otherwise
+ * PostgreSQL may get checksum errors after crashes on pages that
+ * are in fact partially written and hence corrupted. With
+ * full_page_writes enabled, PostgreSQL will replace each page
+ * without ever looking at the partially-written page and seeing
+ * an incorrect checksum. Hence, checksums will detect only real
+ * disk corruptions, i.e. places where the disk reported a
+ * successful write but the data was still corrupted at some
+ * point.
+ *
+ * Alternatively, we may want to leave this check out. This would
+ * be for sophisticated users who have some other guarantee
+ * (hardware and/or software) against ever producing a partial
+ * write during crashes.
+ */
+ if (page_checksum && !fullPageWrites)
+ ereport(ERROR,
+ (errmsg("full_page_writes must be enabled if page_checksum is enabled.")));
+
+ /*
* Other one-time internal sanity checks can go here, if they are fast.
* (Put any slow processing further down, after postmaster.pid creation.)
*/
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 4f607cd..1756d62 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -17,6 +17,7 @@
*/
#include "postgres.h"
+#include "catalog/catalog.h"
#include "commands/tablespace.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
@@ -79,6 +80,12 @@ static const int NSmgr = lengthof(smgrsw);
*/
static HTAB *SMgrRelationHash = NULL;
+/*
+ * Page checksumming.
+ *
+ * tempbuf is a page-sized scratch buffer: when page_checksum is on,
+ * smgrextend() and smgrwrite() copy the outgoing page into it, store the
+ * checksum in the copy, and hand the copy to the storage manager.
+ * page_checksum itself is defined in guc.c.
+ *
+ * INVALID_CKSUM is the value stored in a page's cksum field when the page
+ * is written with page_checksum off; smgrread() does not verify pages that
+ * carry it.
+ */
+static uint64 tempbuf[BLCKSZ/sizeof(uint64)];
+extern bool page_checksum;
+
+#define INVALID_CKSUM 0x1b0af034
+
/* local function prototypes */
static void smgrshutdown(int code, Datum arg);
@@ -381,6 +388,59 @@ smgrdounlink(SMgrRelation reln, ForkNumber forknum, bool isRedo)
}
/*
+ * The initial value when computing the checksum for a data page.
+ */
+static inline uint64
+ChecksumInit(SMgrRelation reln, ForkNumber f, BlockNumber b)
+{
+	/* Seed is block number plus fork number; reln does not contribute. */
+	const uint64 seed = b + f;
+
+	return seed;
+}
+
+/*
+ * Compute the checksum of a buffer of len bytes, starting from the
+ * initial value cksum.
+ *
+ * The buffer is consumed as 64-bit words in order; each word is mixed in
+ * as cksum = cksum * 33 + word (mod 2^64). Bytes beyond the last whole
+ * word are ignored. The 64-bit state is then folded to 32 bits by adding
+ * its two halves; a carry out of the low 32 bits is discarded.
+ *
+ * This is deliberately a cheap calculation to keep overhead low; it could
+ * be replaced with some kind of CRC.
+ */
+static inline uint32
+ComputeChecksum(uint64 *buffer, uint32 len, uint64 cksum)
+{
+	const uint32 nwords = len / sizeof(uint64);
+	uint32		i;
+
+	/*
+	 * Walk exactly nwords words. Stepping four words at a time would read
+	 * past the end of the buffer whenever nwords is not a multiple of 4;
+	 * for multiples of 4 the result is the same either way.
+	 */
+	for (i = 0; i < nwords; i++) {
+		cksum += (cksum << 5) + buffer[i];
+	}
+
+	return (uint32) ((cksum & 0xFFFFFFFF) + (cksum >> 32));
+}
+
+/*
+ * Copy len bytes from buffer to dst, computing the checksum of the words
+ * as they are copied, so that the returned checksum is correct for the
+ * final contents of dst even if buffer changes concurrently.
+ *
+ * cksum is the initial value. Words are mixed in as
+ * cksum = cksum * 33 + word (mod 2^64) and the 64-bit state is folded to
+ * 32 bits by adding its two halves. Bytes beyond the last whole 64-bit
+ * word are neither copied nor checksummed.
+ */
+static inline uint32
+CopyAndComputeChecksum(uint64 *dst, volatile uint64 *buffer,
+					   uint32 len, uint64 cksum)
+{
+	const uint32 nwords = len / sizeof(uint64);
+	uint32		i;
+
+	/*
+	 * Walk exactly nwords words. Stepping four words at a time would run
+	 * past the end of both buffers whenever nwords is not a multiple of 4;
+	 * for multiples of 4 the result is the same either way.
+	 */
+	for (i = 0; i < nwords; i++) {
+		/* Read each source word once: the value stored is the value summed. */
+		const uint64 word = buffer[i];
+
+		dst[i] = word;
+		cksum += (cksum << 5) + word;
+	}
+
+	return (uint32) ((cksum & 0xFFFFFFFF) + (cksum >> 32));
+}
+
+/*
* smgrextend() -- Add a new block to a file.
*
* The semantics are nearly the same as smgrwrite(): write at the
@@ -393,8 +453,25 @@ void
smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool skipFsync)
{
+ PageHeader p; /* page image actually handed to the storage manager */
+ Assert(PageGetPageLayoutVersion(((PageHeader)buffer)) == PG_PAGE_LAYOUT_VERSION ||
+ PageIsNew(buffer));
+ if (page_checksum) {
+ p = (PageHeader)tempbuf;
+ ((PageHeader)buffer)->cksum = 0; /* note: modifies the caller's buffer */
+ /*
+ * The checksum field was zeroed above so that it does not feed
+ * into its own checksum. We now copy the page into tempbuf,
+ * computing the checksum of the copied words, and store the
+ * result in the copy only. The block is written from the copy,
+ * which avoids any problem with hint bits changing in the
+ * caller's buffer after we compute the checksum.
+ */
+ p->cksum = CopyAndComputeChecksum(tempbuf, (uint64 *)buffer, BLCKSZ,
+ ChecksumInit(reln, forknum, blocknum));
+ } else {
+ p = (PageHeader)buffer; /* no copy: write the caller's buffer directly */
+ p->cksum = INVALID_CKSUM; /* marker: smgrread() skips verification of this page */
+ }
(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum,
- buffer, skipFsync);
+ (char *)p, skipFsync);
}
/*
@@ -418,7 +495,29 @@ void
smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer)
{
+	PageHeader	p = (PageHeader) buffer;
(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer);
+	Assert(PageIsNew(p) || PageGetPageLayoutVersion(p) == PG_PAGE_LAYOUT_VERSION);
+
+	/*
+	 * Verify the page just read, unless checksumming is off or the page
+	 * carries INVALID_CKSUM (it was written while page_checksum was off).
+	 */
+	if (page_checksum && p->cksum != INVALID_CKSUM) {
+		const uint32 diskCksum = p->cksum;
+		bool		allZeros = (diskCksum == 0);
+		uint32		w;
+
+		/*
+		 * A page consisting entirely of zero bytes carries no stored
+		 * checksum information, so there is nothing to verify. Without
+		 * this test such a page would almost always fail verification
+		 * (and PANIC) whenever ChecksumInit() returns a nonzero seed.
+		 * The scan only runs when the stored checksum is 0 and stops at
+		 * the first nonzero word.
+		 */
+		for (w = 0; allZeros && w < BLCKSZ / sizeof(uint64); w++) {
+			allZeros = (((uint64 *) buffer)[w] == 0);
+		}
+
+		if (!allZeros) {
+			uint32		cksum;
+
+			/* The field must be zero while checksumming, as it was on write. */
+			p->cksum = 0;
+			cksum = ComputeChecksum((uint64 *) buffer, BLCKSZ,
+									ChecksumInit(reln, forknum, blocknum));
+
+			if (cksum != diskCksum) {
+				/* Oid fields are unsigned, hence %u; forknum is an enum. */
+				ereport(PANIC,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("checksum mismatch: disk has %#x, should be %#x\n"
+								"filename %s, BlockNum %u, block specifier %u/%u/%u/%d/%u",
+								diskCksum, cksum,
+								relpath(reln->smgr_rnode, forknum),
+								blocknum,
+								reln->smgr_rnode.node.spcNode,
+								reln->smgr_rnode.node.dbNode,
+								reln->smgr_rnode.node.relNode,
+								forknum, blocknum)));
+			}
+		}
+	}
}
/*
@@ -440,8 +539,25 @@ void
smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool skipFsync)
{
+ PageHeader p; /* page image actually handed to the storage manager */
+
+ if (page_checksum) {
+ p = (PageHeader)tempbuf;
+ ((PageHeader)buffer)->cksum = 0; /* note: modifies the caller's buffer */
+ /*
+ * The checksum field was zeroed above so that it does not feed
+ * into its own checksum. We now copy the page into tempbuf,
+ * computing the checksum of the copied words, and store the
+ * result in the copy only. The block is written from the copy
+ * so that we avoid any problem with hint bits changing in the
+ * caller's buffer after we compute the checksum.
+ */
+ p->cksum = CopyAndComputeChecksum(tempbuf, (uint64 *)buffer, BLCKSZ,
+ ChecksumInit(reln, forknum, blocknum));
+ } else {
+ p = (PageHeader)buffer; /* no copy: write the caller's buffer directly */
+ p->cksum = INVALID_CKSUM; /* marker: smgrread() skips verification of this page */
+ }
+ Assert(PageGetPageLayoutVersion(p) == PG_PAGE_LAYOUT_VERSION); /* NOTE(review): unlike smgrextend(), PageIsNew pages are not accepted here - confirm intended */
(*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum,
- buffer, skipFsync);
+ (char *)p, skipFsync);
}
/*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index da7b6d4..332b960 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -419,6 +419,7 @@ bool default_with_oids = false;
bool SQL_inheritance = true;
bool Password_encryption = true;
+bool page_checksum = true; /* GUC page_checksum: write and verify page checksums in smgr.c */
int log_min_error_statement = ERROR;
int log_min_messages = WARNING;
@@ -1438,6 +1439,14 @@ static struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
+	{
+		/*
+		 * Postmaster-level switch for the page checksums computed in smgr.c.
+		 * Filed under the WAL settings, next to full_page_writes, which the
+		 * postmaster requires to be on whenever this is on.
+		 */
+		{"page_checksum", PGC_POSTMASTER, WAL_SETTINGS,
+			gettext_noop("Enable disk page checksums."),
+			NULL,
+		},
+		&page_checksum, true, NULL, NULL, NULL
+	},
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 315db46..6a107b9 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -167,6 +167,8 @@
# fsync_writethrough
# open_sync
#full_page_writes = on # recover from partial page writes
+#page_checksum = on # checksum disk pages
+
#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers
# (change requires restart)
#wal_writer_delay = 200ms # 1-10000 milliseconds
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 14e177d..05ae537 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 201112071
+#define CATALOG_VERSION_NO 201112141
#endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 42d6b10..847f157 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -132,6 +132,7 @@ typedef struct PageHeaderData
LocationIndex pd_special; /* offset to start of special space */
uint16 pd_pagesize_version;
TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
+ uint32 cksum; /* page checksum */
ItemIdData pd_linp[1]; /* beginning of line pointer array */
} PageHeaderData;
@@ -165,8 +166,9 @@ typedef PageHeaderData *PageHeader;
* Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
* added the pd_flags field (by stealing some bits from pd_tli),
* as well as adding the pd_prune_xid field (which enlarges the header).
+ * Release 9.2 uses 5; we added checksums to heap, index and fsm files.
*/
-#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_PAGE_LAYOUT_VERSION 5
/* ----------------------------------------------------------------