*** a/contrib/pg_upgrade/controldata.c
--- b/contrib/pg_upgrade/controldata.c
***************
*** 56,61 **** get_control_data(ClusterInfo *cluster, bool live_check)
--- 56,62 ----
bool got_toast = false;
bool got_date_is_int = false;
bool got_float8_pass_by_value = false;
+ bool got_data_checksums = false;
char *lc_collate = NULL;
char *lc_ctype = NULL;
char *lc_monetary = NULL;
***************
*** 131,136 **** get_control_data(ClusterInfo *cluster, bool live_check)
--- 132,144 ----
got_float8_pass_by_value = true;
}
+ /* Data checksums were introduced in 9.3; older clusters never have them */
+ if (GET_MAJOR_VERSION(cluster->major_version) <= 902)
+ {
+ cluster->controldata.data_checksums = false;
+ got_data_checksums = true;
+ }
+
/* we have the result of cmd in "output". so parse it line by line now */
while (fgets(bufin, sizeof(bufin), output))
{
***************
*** 393,398 **** get_control_data(ClusterInfo *cluster, bool live_check)
--- 401,418 ----
cluster->controldata.float8_pass_by_value = strstr(p, "by value") != NULL;
got_float8_pass_by_value = true;
}
+ else if ((p = strstr(bufin, "checksums")) != NULL)
+ {
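+ /* 9.3 and later pg_controldata reports, e.g., "Data page checksums: enabled" */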
+ p = strchr(p, ':');
+
+ if (p == NULL || strlen(p) <= 1)
+ pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__);
+
+ p++; /* remove ':' char */
+ /* used later to check that the old and new clusters match */
+ cluster->controldata.data_checksums = strstr(p, "enabled") != NULL;
+ got_data_checksums = true;
+ }
/* In pre-8.4 only */
else if ((p = strstr(bufin, "LC_COLLATE:")) != NULL)
{
***************
*** 475,481 **** get_control_data(ClusterInfo *cluster, bool live_check)
!got_tli ||
!got_align || !got_blocksz || !got_largesz || !got_walsz ||
!got_walseg || !got_ident || !got_index || !got_toast ||
! !got_date_is_int || !got_float8_pass_by_value)
{
pg_log(PG_REPORT,
"The %s cluster lacks some required control information:\n",
--- 495,501 ----
!got_tli ||
!got_align || !got_blocksz || !got_largesz || !got_walsz ||
!got_walseg || !got_ident || !got_index || !got_toast ||
! !got_date_is_int || !got_float8_pass_by_value || !got_data_checksums)
{
pg_log(PG_REPORT,
"The %s cluster lacks some required control information:\n",
***************
*** 533,538 **** get_control_data(ClusterInfo *cluster, bool live_check)
--- 553,562 ----
if (!got_float8_pass_by_value)
pg_log(PG_REPORT, " float8 argument passing method\n");
+ /* value added in Postgres 9.3 */
+ if (!got_data_checksums)
+ pg_log(PG_REPORT, " data checksums\n");
+
pg_log(PG_FATAL,
"Cannot continue without required control information, terminating\n");
}
***************
*** 594,599 **** check_control_data(ControlData *oldctrl,
--- 618,629 ----
"--disable-integer-datetimes or get server binaries built with those\n"
"options.\n");
}
+
+ if (oldctrl->data_checksums != newctrl->data_checksums)
+ {
+ pg_log(PG_FATAL,
+ "old and new pg_controldata checksums settings are invalid or do not match\n");
+ }
}
*** a/contrib/pg_upgrade/pg_upgrade.h
--- b/contrib/pg_upgrade/pg_upgrade.h
***************
*** 199,204 **** typedef struct
--- 199,205 ----
uint32 toast;
bool date_is_int;
bool float8_pass_by_value;
+ bool data_checksums;
char *lc_collate;
char *lc_ctype;
char *encoding;
*** a/doc/src/sgml/ref/initdb.sgml
--- b/doc/src/sgml/ref/initdb.sgml
***************
*** 183,188 **** PostgreSQL documentation
--- 183,201 ----
+ 
+ <varlistentry>
+ <term><option>-k</option></term>
+ <term><option>--data-checksums</option></term>
+ <listitem>
+ <para>
+ Use checksums on data pages to help detect corruption by the
+ I/O system that would otherwise be silent. Enabling checksums
+ may incur a slight performance penalty. This option can only
+ be set during initialization, and cannot be changed later. See
+ <xref linkend="checksums"> for details.
+ </para>
+ </listitem>
+ </varlistentry>
+ 
*** a/doc/src/sgml/wal.sgml
--- b/doc/src/sgml/wal.sgml
***************
*** 177,182 ****
--- 177,208 ----
(BBU) disk controllers do not prevent partial page writes unless
they guarantee that data is written to the BBU as full (8kB) pages.
+ 
+ <sect1 id="checksums">
+ <title>Checksums</title>
+ 
+ <indexterm>
+ <primary>checksums</primary>
+ </indexterm>
+ 
+ <para>
+ Even data recorded to disk may be lost due to media failure or
+ other corruption. While <productname>PostgreSQL</> cannot do
+ anything to prevent such loss, checksums allow early detection of
+ those problems. Detecting such corruption quickly is crucial before
+ taking a backup or rebuilding a replication slave; otherwise,
+ there is a chance that the corruption could make it to the backup
+ or replica.
+ </para>
+ 
+ <para>
+ The WAL is always protected by a checksum, which prevents
+ corrupted WAL records from being replayed during recovery. To
+ protect data pages, so that corrupt data pages aren't read into
+ shared memory, checksums must be enabled when the cluster is
+ initialized, using <application>initdb</application>'s
+ <option>--data-checksums</option> option.
+ </para>
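+ 
+ <para>
+ For example, a new cluster with data page checksums could be
+ created like this (the data directory location is only illustrative):
+ </para>
+ 
+ <screen>
+ initdb --data-checksums -D /usr/local/pgsql/data
+ </screen>
+ 
+ <para>
+ Whether an existing cluster uses data page checksums can be checked
+ with <application>pg_controldata</application>, which reports
+ <literal>Data page checksums</literal> as either
+ <literal>enabled</literal> or <literal>disabled</literal>.
+ </para>
+ </sect1>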
*** a/src/backend/access/hash/hash.c
--- b/src/backend/access/hash/hash.c
***************
*** 287,295 **** hashgettuple(PG_FUNCTION_ARGS)
/*
* Since this can be redone later if needed, it's treated the same
* as a commit-hint-bit status update for heap tuples: we mark the
! * buffer dirty but don't make a WAL log entry.
*/
! SetBufferCommitInfoNeedsSave(buf);
}
/*
--- 287,296 ----
/*
* Since this can be redone later if needed, it's treated the same
* as a commit-hint-bit status update for heap tuples: we mark the
! * buffer dirty, but avoid writing WAL unless we require a
! * full-page image (e.g. if checksums are enabled).
*/
! MarkBufferDirtyHint(buf);
}
/*
*** a/src/backend/access/hash/hashpage.c
--- b/src/backend/access/hash/hashpage.c
***************
*** 712,717 **** _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
--- 712,718 ----
MemSet(zerobuf, 0, sizeof(zerobuf));
RelationOpenSmgr(rel);
+ /* no need to set page checksum for all-zero pages */
smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);
return true;
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
***************
*** 4859,4865 **** l4:
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED, rdata);
PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
}
END_CRIT_SECTION();
--- 4859,4864 ----
***************
*** 5714,5730 **** log_heap_freeze(Relation reln, Buffer buffer,
* being marked all-visible, and vm_buffer is the buffer containing the
* corresponding visibility map block. Both should have already been modified
* and dirtied.
*/
XLogRecPtr
! log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
TransactionId cutoff_xid)
{
xl_heap_visible xlrec;
XLogRecPtr recptr;
! XLogRecData rdata[2];
xlrec.node = rnode;
! xlrec.block = block;
xlrec.cutoff_xid = cutoff_xid;
rdata[0].data = (char *) &xlrec;
--- 5713,5735 ----
* being marked all-visible, and vm_buffer is the buffer containing the
* corresponding visibility map block. Both should have already been modified
* and dirtied.
+ *
+ * If checksums are enabled, we also add the heap_buffer to the chain to
+ * protect it from being torn.
*/
XLogRecPtr
! log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
TransactionId cutoff_xid)
{
xl_heap_visible xlrec;
XLogRecPtr recptr;
! XLogRecData rdata[3];
!
! Assert(BufferIsValid(heap_buffer));
! Assert(BufferIsValid(vm_buffer));
xlrec.node = rnode;
! xlrec.block = BufferGetBlockNumber(heap_buffer);
xlrec.cutoff_xid = cutoff_xid;
rdata[0].data = (char *) &xlrec;
***************
*** 5738,5743 **** log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
--- 5743,5759 ----
rdata[1].buffer_std = false;
rdata[1].next = NULL;
+ if (DataChecksumsEnabled())
+ {
+ rdata[1].next = &(rdata[2]);
+
+ rdata[2].data = NULL;
+ rdata[2].len = 0;
+ rdata[2].buffer = heap_buffer;
+ rdata[2].buffer_std = true;
+ rdata[2].next = NULL;
+ }
+
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE, rdata);
return recptr;
***************
*** 6099,6106 **** static void
heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
{
xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
- Buffer buffer;
- Page page;
/*
* If there are any Hot Standby transactions running that have an xmin
--- 6115,6120 ----
***************
*** 6115,6153 **** heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, xlrec->node);
/*
! * Read the heap page, if it still exists. If the heap file has been
! * dropped or truncated later in recovery, we don't need to update the
! * page, but we'd better still update the visibility map.
*/
! buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block,
! RBM_NORMAL);
! if (BufferIsValid(buffer))
{
! LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
!
! page = (Page) BufferGetPage(buffer);
/*
! * We don't bump the LSN of the heap page when setting the visibility
! * map bit, because that would generate an unworkable volume of
! * full-page writes. This exposes us to torn page hazards, but since
! * we're not inspecting the existing page contents in any way, we
! * don't care.
! *
! * However, all operations that clear the visibility map bit *do* bump
! * the LSN, and those operations will only be replayed if the XLOG LSN
! * follows the page LSN. Thus, if the page LSN has advanced past our
! * XLOG record's LSN, we mustn't mark the page all-visible, because
! * the subsequent update won't be replayed to clear the flag.
*/
! if (lsn > PageGetLSN(page))
{
! PageSetAllVisible(page);
! MarkBufferDirty(buffer);
! }
! /* Done with heap page. */
! UnlockReleaseBuffer(buffer);
}
/*
--- 6129,6184 ----
ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, xlrec->node);
/*
! * If heap block was backed up, restore it. This can only happen with
! * checksums enabled.
*/
! if (record->xl_info & XLR_BKP_BLOCK(1))
{
! Assert(DataChecksumsEnabled());
! (void) RestoreBackupBlock(lsn, record, 1, false, false);
! }
! else
! {
! Buffer buffer;
! Page page;
/*
! * Read the heap page, if it still exists. If the heap file has been
! * dropped or truncated later in recovery, we don't need to update the
! * page, but we'd better still update the visibility map.
*/
! buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM,
! xlrec->block, RBM_NORMAL);
! if (BufferIsValid(buffer))
{
! LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! page = (Page) BufferGetPage(buffer);
!
! /*
! * We don't bump the LSN of the heap page when setting the
! * visibility map bit (unless checksums are enabled, in which case
! * we must), because that would generate an unworkable volume of
! * full-page writes. This exposes us to torn page hazards, but
! * since we're not inspecting the existing page contents in any
! * way, we don't care.
! *
! * However, all operations that clear the visibility map bit *do*
! * bump the LSN, and those operations will only be replayed if the
! * XLOG LSN follows the page LSN. Thus, if the page LSN has
! * advanced past our XLOG record's LSN, we mustn't mark the page
! * all-visible, because the subsequent update won't be replayed to
! * clear the flag.
! */
! if (lsn > PageGetLSN(page))
! {
! PageSetAllVisible(page);
! MarkBufferDirty(buffer);
! }
!
! /* Done with heap page. */
! UnlockReleaseBuffer(buffer);
! }
}
/*
***************
*** 6178,6184 **** heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
* real harm is done; and the next VACUUM will fix it.
*/
if (lsn > PageGetLSN(BufferGetPage(vmbuffer)))
! visibilitymap_set(reln, xlrec->block, lsn, vmbuffer,
xlrec->cutoff_xid);
ReleaseBuffer(vmbuffer);
--- 6209,6215 ----
* real harm is done; and the next VACUUM will fix it.
*/
if (lsn > PageGetLSN(BufferGetPage(vmbuffer)))
! visibilitymap_set(reln, xlrec->block, InvalidBuffer, lsn, vmbuffer,
xlrec->cutoff_xid);
ReleaseBuffer(vmbuffer);
***************
*** 6927,6933 **** heap_xlog_lock_updated(XLogRecPtr lsn, XLogRecord *record)
HeapTupleHeaderSetXmax(htup, xlrec->xmax);
PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
UnlockReleaseBuffer(buffer);
}
--- 6958,6963 ----
*** a/src/backend/access/heap/pruneheap.c
--- b/src/backend/access/heap/pruneheap.c
***************
*** 262,268 **** heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
{
((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
PageClearFull(page);
! SetBufferCommitInfoNeedsSave(buffer);
}
}
--- 262,268 ----
{
((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
PageClearFull(page);
! MarkBufferDirtyHint(buffer);
}
}
*** a/src/backend/access/heap/rewriteheap.c
--- b/src/backend/access/heap/rewriteheap.c
***************
*** 273,278 **** end_heap_rewrite(RewriteState state)
--- 273,280 ----
/* Write the last page, if any */
if (state->rs_buffer_valid)
{
+ PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
+
if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,
MAIN_FORKNUM,
***************
*** 614,619 **** raw_heap_insert(RewriteState state, HeapTuple tup)
--- 616,623 ----
{
/* Doesn't fit, so write out the existing page */
+ PageSetChecksumInplace(page, state->rs_blockno);
+
/* XLOG stuff */
if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,
*** a/src/backend/access/heap/visibilitymap.c
--- b/src/backend/access/heap/visibilitymap.c
***************
*** 233,245 **** visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
* marked all-visible; it is needed for Hot Standby, and can be
* InvalidTransactionId if the page contains no tuples.
*
* You must pass a buffer containing the correct map page to this function.
* Call visibilitymap_pin first to pin the right one. This function doesn't do
* any I/O.
*/
void
! visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
! Buffer buf, TransactionId cutoff_xid)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
--- 233,250 ----
* marked all-visible; it is needed for Hot Standby, and can be
* InvalidTransactionId if the page contains no tuples.
*
+ * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
+ * this function. Except in recovery, caller should also pass the heap
+ * buffer. When checksums are enabled and we're not in recovery, we must add
+ * the heap buffer to the WAL chain to protect it from being torn.
+ *
* You must pass a buffer containing the correct map page to this function.
* Call visibilitymap_pin first to pin the right one. This function doesn't do
* any I/O.
*/
void
! visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
! XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
***************
*** 252,285 **** visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
#endif
Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
! /* Check that we have the right page pinned */
! if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
! elog(ERROR, "wrong buffer passed to visibilitymap_set");
! page = BufferGetPage(buf);
map = PageGetContents(page);
! LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
if (!(map[mapByte] & (1 << mapBit)))
{
START_CRIT_SECTION();
map[mapByte] |= (1 << mapBit);
! MarkBufferDirty(buf);
if (RelationNeedsWAL(rel))
{
if (XLogRecPtrIsInvalid(recptr))
! recptr = log_heap_visible(rel->rd_node, heapBlk, buf,
cutoff_xid);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
}
! LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
/*
--- 257,311 ----
#endif
Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
+ Assert(InRecovery || BufferIsValid(heapBuf));
! /* Check that we have the right heap page pinned, if present */
! if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
! elog(ERROR, "wrong heap buffer passed to visibilitymap_set");
! /* Check that we have the right VM page pinned */
! if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
! elog(ERROR, "wrong VM buffer passed to visibilitymap_set");
!
! page = BufferGetPage(vmBuf);
map = PageGetContents(page);
! LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);
if (!(map[mapByte] & (1 << mapBit)))
{
START_CRIT_SECTION();
map[mapByte] |= (1 << mapBit);
! MarkBufferDirty(vmBuf);
if (RelationNeedsWAL(rel))
{
if (XLogRecPtrIsInvalid(recptr))
! {
! Assert(!InRecovery);
! recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
cutoff_xid);
+
+ /*
+ * If data checksums are enabled, we need to protect the heap
+ * page from being torn.
+ */
+ if (DataChecksumsEnabled())
+ {
+ Page heapPage = BufferGetPage(heapBuf);
+
+ /* caller is expected to set PD_ALL_VISIBLE first */
+ Assert(PageIsAllVisible(heapPage));
+ PageSetLSN(heapPage, recptr);
+ }
+ }
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
}
! LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
}
/*
***************
*** 579,584 **** vm_extend(Relation rel, BlockNumber vm_nblocks)
--- 605,612 ----
/* Now extend the file */
while (vm_nblocks_now < vm_nblocks)
{
+ PageSetChecksumInplace(pg, vm_nblocks_now);
+
smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
(char *) pg, false);
vm_nblocks_now++;
*** a/src/backend/access/nbtree/nbtinsert.c
--- b/src/backend/access/nbtree/nbtinsert.c
***************
*** 405,415 **** _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
*/
ItemIdMarkDead(curitemid);
opaque->btpo_flags |= BTP_HAS_GARBAGE;
! /* be sure to mark the proper buffer dirty... */
if (nbuf != InvalidBuffer)
! SetBufferCommitInfoNeedsSave(nbuf);
else
! SetBufferCommitInfoNeedsSave(buf);
}
}
}
--- 405,420 ----
*/
ItemIdMarkDead(curitemid);
opaque->btpo_flags |= BTP_HAS_GARBAGE;
!
! /*
! * Be sure to mark the proper buffer dirty. If checksums
! * are enabled, this may also require a full-page image
! * (see comments in MarkBufferDirtyHint).
! */
if (nbuf != InvalidBuffer)
! MarkBufferDirtyHint(nbuf);
else
! MarkBufferDirtyHint(buf);
}
}
}
*** a/src/backend/access/nbtree/nbtree.c
--- b/src/backend/access/nbtree/nbtree.c
***************
*** 217,222 **** btbuildempty(PG_FUNCTION_ARGS)
--- 217,223 ----
_bt_initmetapage(metapage, P_NONE, 0);
/* Write the page. If archiving/streaming, XLOG it. */
+ PageSetChecksumInplace(metapage, BTREE_METAPAGE);
smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
(char *) metapage, true);
if (XLogIsNeeded())
***************
*** 1051,1057 **** restart:
opaque->btpo_cycleid == vstate->cycleid)
{
opaque->btpo_cycleid = 0;
! SetBufferCommitInfoNeedsSave(buf);
}
}
--- 1052,1058 ----
opaque->btpo_cycleid == vstate->cycleid)
{
opaque->btpo_cycleid = 0;
! MarkBufferDirtyHint(buf);
}
}
*** a/src/backend/access/nbtree/nbtsort.c
--- b/src/backend/access/nbtree/nbtsort.c
***************
*** 284,295 **** _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
--- 284,298 ----
{
if (!wstate->btws_zeropage)
wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
+ /* no need to set checksum for all-zero pages */
smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
wstate->btws_pages_written++,
(char *) wstate->btws_zeropage,
true);
}
+ PageSetChecksumInplace(page, blkno);
+
/*
* Now write the page. There's no need for smgr to schedule an fsync for
* this write; we'll do it ourselves before ending the build.
*** a/src/backend/access/nbtree/nbtutils.c
--- b/src/backend/access/nbtree/nbtutils.c
***************
*** 1783,1789 **** _bt_killitems(IndexScanDesc scan, bool haveLock)
/*
* Since this can be redone later if needed, it's treated the same as a
* commit-hint-bit status update for heap tuples: we mark the buffer dirty
! * but don't make a WAL log entry.
*
* Whenever we mark anything LP_DEAD, we also set the page's
* BTP_HAS_GARBAGE flag, which is likewise just a hint.
--- 1783,1790 ----
/*
* Since this can be redone later if needed, it's treated the same as a
* commit-hint-bit status update for heap tuples: we mark the buffer dirty
! * but avoid writing WAL unless we require a full-page image (e.g. if
! * checksums are enabled).
*
* Whenever we mark anything LP_DEAD, we also set the page's
* BTP_HAS_GARBAGE flag, which is likewise just a hint.
***************
*** 1791,1797 **** _bt_killitems(IndexScanDesc scan, bool haveLock)
if (killedsomething)
{
opaque->btpo_flags |= BTP_HAS_GARBAGE;
! SetBufferCommitInfoNeedsSave(so->currPos.buf);
}
if (!haveLock)
--- 1792,1798 ----
if (killedsomething)
{
opaque->btpo_flags |= BTP_HAS_GARBAGE;
! MarkBufferDirtyHint(so->currPos.buf);
}
if (!haveLock)
*** a/src/backend/access/rmgrdesc/xlogdesc.c
--- b/src/backend/access/rmgrdesc/xlogdesc.c
***************
*** 79,84 **** xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
--- 79,88 ----
appendStringInfo(buf, "restore point: %s", xlrec->rp_name);
}
+ else if (info == XLOG_HINT)
+ {
+ appendStringInfo(buf, "page hint");
+ }
else if (info == XLOG_BACKUP_END)
{
XLogRecPtr startpoint;
*** a/src/backend/access/spgist/spginsert.c
--- b/src/backend/access/spgist/spginsert.c
***************
*** 154,159 **** spgbuildempty(PG_FUNCTION_ARGS)
--- 154,160 ----
SpGistInitMetapage(page);
/* Write the page. If archiving/streaming, XLOG it. */
+ PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
(char *) page, true);
if (XLogIsNeeded())
***************
*** 163,168 **** spgbuildempty(PG_FUNCTION_ARGS)
--- 164,170 ----
/* Likewise for the root page. */
SpGistInitPage(page, SPGIST_LEAF);
+ PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO,
(char *) page, true);
if (XLogIsNeeded())
***************
*** 172,177 **** spgbuildempty(PG_FUNCTION_ARGS)
--- 174,180 ----
/* Likewise for the null-tuples root page. */
SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);
+ PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO,
(char *) page, true);
if (XLogIsNeeded())
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 61,66 ****
--- 61,67 ----
#include "utils/timestamp.h"
#include "pg_trace.h"
+ extern bool bootstrap_data_checksums;
/* File path names (all relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE "recovery.conf"
***************
*** 699,704 **** XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
--- 700,706 ----
bool updrqst;
bool doPageWrites;
bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ bool isHint = (rmid == RM_XLOG_ID && info == XLOG_HINT);
uint8 info_orig = info;
static XLogRecord *rechdr;
***************
*** 969,974 **** begin:;
--- 971,988 ----
}
/*
+ * If this is a hint record and we don't need a backup block then
+ * we have no more work to do and can exit quickly without inserting
+ * a WAL record at all. In that case return InvalidXLogRecPtr.
+ */
+ if (isHint && !(info & XLR_BKP_BLOCK_MASK))
+ {
+ LWLockRelease(WALInsertLock);
+ END_CRIT_SECTION();
+ return InvalidXLogRecPtr;
+ }
+
+ /*
* If the current page is completely full, the record goes to the next
* page, right after the page header.
*/
***************
*** 3156,3161 **** RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
--- 3170,3180 ----
BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
}
+ /*
+ * Any checksum set on this page will be invalid. We don't need
+ * to reset it here since it will be set before being written.
+ */
+
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
***************
*** 3682,3687 **** GetSystemIdentifier(void)
--- 3701,3716 ----
}
/*
+ * Are checksums enabled for data pages?
+ */
+ bool
+ DataChecksumsEnabled(void)
+ {
+ Assert(ControlFile != NULL);
+ return ControlFile->data_checksums;
+ }
+
+ /*
* Auto-tune the number of XLOG buffers.
*
* The preferred setting for wal_buffers is about 3% of shared_buffers, with
***************
*** 3979,3984 **** BootStrapXLOG(void)
--- 4008,4014 ----
ControlFile->max_prepared_xacts = max_prepared_xacts;
ControlFile->max_locks_per_xact = max_locks_per_xact;
ControlFile->wal_level = wal_level;
+ ControlFile->data_checksums = bootstrap_data_checksums;
/* some additional ControlFile fields are set in WriteControlFile() */
***************
*** 7291,7296 **** XLogRestorePoint(const char *rpName)
--- 7321,7371 ----
}
/*
+ * Write a backup block if needed when we are setting a hint. Note that
+ * this may be called for a variety of page types, not just heaps.
+ *
+ * Deciding the "if needed" part is delicate and requires us to either
+ * grab WALInsertLock or check the info_lck spinlock. If we check the
+ * spinlock and it says Yes then we will need to get WALInsertLock as well,
+ * so the design choice here is to just go straight for the WALInsertLock
+ * and trust that calls to this function are minimised elsewhere.
+ *
+ * Callable while holding just share lock on the buffer content.
+ *
+ * Possible that multiple concurrent backends could attempt to write
+ * WAL records. In that case, more than one backup block may be recorded
+ * though that isn't important to the outcome and the backup blocks are
+ * likely to be identical anyway.
+ */
+ #define XLOG_HINT_WATERMARK 13579
+ XLogRecPtr
+ XLogSaveBufferForHint(Buffer buffer)
+ {
+ /*
+ * Make an XLOG entry reporting the hint
+ */
+ XLogRecData rdata[2];
+ int watermark = XLOG_HINT_WATERMARK;
+
+ /*
+ * Not allowed to have zero-length records, so use a small watermark
+ */
+ rdata[0].data = (char *) (&watermark);
+ rdata[0].len = sizeof(int);
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].buffer_std = false;
+ rdata[0].next = &(rdata[1]);
+
+ rdata[1].data = NULL;
+ rdata[1].len = 0;
+ rdata[1].buffer = buffer;
+ rdata[1].buffer_std = true;
+ rdata[1].next = NULL;
+
+ return XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
+ }
+
+ /*
* Check if any of the GUC parameters that are critical for hot standby
* have changed, and update the value in pg_control file if necessary.
*/
***************
*** 7451,7458 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
! /* Backup blocks are not used in xlog records */
! Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
if (info == XLOG_NEXTOID)
{
--- 7526,7533 ----
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
! /* Backup blocks are not used in most xlog records */
! Assert(info == XLOG_HINT || !(record->xl_info & XLR_BKP_BLOCK_MASK));
if (info == XLOG_NEXTOID)
{
***************
*** 7624,7629 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
--- 7699,7732 ----
{
/* nothing to do here */
}
+ else if (info == XLOG_HINT)
+ {
+ #ifdef USE_ASSERT_CHECKING
+ int *watermark = (int *) XLogRecGetData(record);
+ #endif
+
+ /* Check the watermark is correct for the hint record */
+ Assert(*watermark == XLOG_HINT_WATERMARK);
+
+ /* Backup blocks must be present for smgr hint records */
+ Assert(record->xl_info & XLR_BKP_BLOCK_MASK);
+
+ /*
+ * Hint records have no information that needs to be replayed. Their
+ * sole purpose is to ensure that a hint bit does not cause a checksum
+ * invalidation if the hint bit write results in a torn page. So the
+ * body of the record is empty, but there must be one backup block.
+ *
+ * Since the only change in the backup block is a hint bit, there is
+ * no conflict with Hot Standby.
+ *
+ * This also means there is no corresponding API call for this, so an
+ * smgr implementation has no need to implement anything, and nothing
+ * is needed in md.c etc.
+ */
+ RestoreBackupBlock(lsn, record, 0, false, false);
+ }
else if (info == XLOG_BACKUP_END)
{
XLogRecPtr startpoint;
*** a/src/backend/bootstrap/bootstrap.c
--- b/src/backend/bootstrap/bootstrap.c
***************
*** 48,53 ****
--- 48,55 ----
extern int optind;
extern char *optarg;
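+ /* Set by the -k option; copied into pg_control by BootStrapXLOG() */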
+ bool bootstrap_data_checksums = false;
+
#define ALLOC(t, c) ((t *) calloc((unsigned)(c), sizeof(t)))
***************
*** 233,239 **** AuxiliaryProcessMain(int argc, char *argv[])
/* If no -x argument, we are a CheckerProcess */
MyAuxProcType = CheckerProcess;
! while ((flag = getopt(argc, argv, "B:c:d:D:Fr:x:-:")) != -1)
{
switch (flag)
{
--- 235,241 ----
/* If no -x argument, we are a CheckerProcess */
MyAuxProcType = CheckerProcess;
! while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:x:-:")) != -1)
{
switch (flag)
{
***************
*** 259,264 **** AuxiliaryProcessMain(int argc, char *argv[])
--- 261,269 ----
case 'F':
SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV);
break;
+ case 'k':
+ bootstrap_data_checksums = true;
+ break;
case 'r':
strlcpy(OutputFileName, optarg, MAXPGPATH);
break;
*** a/src/backend/commands/sequence.c
--- b/src/backend/commands/sequence.c
***************
*** 1115,1121 **** read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple)
HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId);
seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
seqtuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
! SetBufferCommitInfoNeedsSave(*buf);
}
seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
--- 1115,1121 ----
HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId);
seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
seqtuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
! MarkBufferDirtyHint(*buf);
}
seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
*** a/src/backend/commands/tablecmds.c
--- b/src/backend/commands/tablecmds.c
***************
*** 8813,8818 **** copy_relation_data(SMgrRelation src, SMgrRelation dst,
--- 8813,8820 ----
smgrread(src, forkNum, blkno, buf);
+ PageSetChecksumInplace(page, blkno);
+
/* XLOG stuff */
if (use_wal)
log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page);
*** a/src/backend/commands/vacuumlazy.c
--- b/src/backend/commands/vacuumlazy.c
***************
*** 670,677 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
{
PageSetAllVisible(page);
MarkBufferDirty(buf);
! visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
! InvalidTransactionId);
}
UnlockReleaseBuffer(buf);
--- 670,677 ----
{
PageSetAllVisible(page);
MarkBufferDirty(buf);
! visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
! vmbuffer, InvalidTransactionId);
}
UnlockReleaseBuffer(buf);
***************
*** 900,907 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
{
PageSetAllVisible(page);
MarkBufferDirty(buf);
! visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
! visibility_cutoff_xid);
}
else if (!all_visible_according_to_vm)
{
--- 900,907 ----
{
PageSetAllVisible(page);
MarkBufferDirty(buf);
! visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
! vmbuffer, visibility_cutoff_xid);
}
else if (!all_visible_according_to_vm)
{
***************
*** 911,918 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
* allowed. Set the visibility map bit as well so that we get
* back in sync.
*/
! visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
! visibility_cutoff_xid);
}
}
--- 911,918 ----
* allowed. Set the visibility map bit as well so that we get
* back in sync.
*/
! visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
! vmbuffer, visibility_cutoff_xid);
}
}
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 34,39 ****
--- 34,40 ----
#include
#include "catalog/catalog.h"
+ #include "catalog/storage.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pg_trace.h"
***************
*** 459,465 **** ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
/* check for garbage data */
! if (!PageHeaderIsValid((PageHeader) bufBlock))
{
if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
{
--- 460,466 ----
}
/* check for garbage data */
! if (!PageIsVerified((Page) bufBlock, blockNum))
{
if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
{
***************
*** 654,667 **** BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* victim. We need lock to inspect the page LSN, so this
* can't be done inside StrategyGetBuffer.
*/
! if (strategy != NULL &&
! XLogNeedsFlush(BufferGetLSN(buf)) &&
! StrategyRejectBuffer(strategy, buf))
{
! /* Drop lock/pin and loop around for another buffer */
! LWLockRelease(buf->content_lock);
! UnpinBuffer(buf, true);
! continue;
}
/* OK, do the I/O */
--- 655,677 ----
* victim. We need lock to inspect the page LSN, so this
* can't be done inside StrategyGetBuffer.
*/
! if (strategy != NULL)
{
! XLogRecPtr lsn;
!
! /* Read the LSN while holding buffer header lock */
! LockBufHdr(buf);
! lsn = BufferGetLSN(buf);
! UnlockBufHdr(buf);
!
! if (XLogNeedsFlush(lsn) &&
! StrategyRejectBuffer(strategy, buf))
! {
! /* Drop lock/pin and loop around for another buffer */
! LWLockRelease(buf->content_lock);
! UnpinBuffer(buf, true);
! continue;
! }
}
/* OK, do the I/O */
***************
*** 1893,1898 **** FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
--- 1903,1910 ----
ErrorContextCallback errcallback;
instr_time io_start,
io_time;
+ Block bufBlock;
+ char *bufToWrite;
/*
* Acquire the buffer's io_in_progress lock. If StartBufferIO returns
***************
*** 1918,1929 **** FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode);
/*
* Force XLOG flush up to buffer's LSN. This implements the basic WAL
* rule that log updates must hit disk before any of the data-file changes
* they describe do.
*/
- recptr = BufferGetLSN(buf);
XLogFlush(recptr);
/*
--- 1930,1952 ----
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode);
+ LockBufHdr(buf);
+
+ /*
+ * Run PageGetLSN while holding header lock, since we don't have the
+ * buffer locked exclusively in all cases.
+ */
+ recptr = BufferGetLSN(buf);
+
+ /* To check if block content changes while flushing. - vadim 01/17/97 */
+ buf->flags &= ~BM_JUST_DIRTIED;
+ UnlockBufHdr(buf);
+
/*
* Force XLOG flush up to buffer's LSN. This implements the basic WAL
* rule that log updates must hit disk before any of the data-file changes
* they describe do.
*/
XLogFlush(recptr);
/*
***************
*** 1932,1949 **** FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
* we have the io_in_progress lock.
*/
! /* To check if block content changes while flushing. - vadim 01/17/97 */
! LockBufHdr(buf);
! buf->flags &= ~BM_JUST_DIRTIED;
! UnlockBufHdr(buf);
if (track_io_timing)
INSTR_TIME_SET_CURRENT(io_start);
smgrwrite(reln,
buf->tag.forkNum,
buf->tag.blockNum,
! (char *) BufHdrGetBlock(buf),
false);
if (track_io_timing)
--- 1955,1974 ----
* we have the io_in_progress lock.
*/
! bufBlock = BufHdrGetBlock(buf);
!
! bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
if (track_io_timing)
INSTR_TIME_SET_CURRENT(io_start);
+ /*
+ * bufToWrite is either the shared buffer or a copy, as appropriate.
+ */
smgrwrite(reln,
buf->tag.forkNum,
buf->tag.blockNum,
! bufToWrite,
false);
if (track_io_timing)
***************
*** 2481,2502 **** IncrBufferRefCount(Buffer buffer)
}
/*
! * SetBufferCommitInfoNeedsSave
*
! * Mark a buffer dirty when we have updated tuple commit-status bits in it.
*
! * This is essentially the same as MarkBufferDirty, except that the caller
! * might have only share-lock instead of exclusive-lock on the buffer's
! * content lock. We preserve the distinction mainly as a way of documenting
! * that the caller has not made a critical data change --- the status-bit
! * update could be redone by someone else just as easily. Therefore, no WAL
! * log record need be generated, whereas calls to MarkBufferDirty really ought
! * to be associated with a WAL-entry-creating action.
*/
void
! SetBufferCommitInfoNeedsSave(Buffer buffer)
{
volatile BufferDesc *bufHdr;
if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer ID: %d", buffer);
--- 2506,2529 ----
}
/*
! * MarkBufferDirtyHint
! *
! * Mark a buffer dirty for non-critical changes.
*
! * This is essentially the same as MarkBufferDirty, except:
*
! * 1. The caller does not write WAL; so if checksums are enabled, we may need
! * to write an XLOG_HINT WAL record to protect against torn pages.
! * 2. The caller might have only share-lock instead of exclusive-lock on the
! * buffer's content lock.
! * 3. This function does not guarantee that the buffer is always marked dirty
! * (due to a race condition), so it cannot be used for important changes.
*/
void
! MarkBufferDirtyHint(Buffer buffer)
{
volatile BufferDesc *bufHdr;
+ Page page = BufferGetPage(buffer);
if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer ID: %d", buffer);
***************
*** 2516,2543 **** SetBufferCommitInfoNeedsSave(Buffer buffer)
/*
* This routine might get called many times on the same page, if we are
* making the first scan after commit of an xact that added/deleted many
! * tuples. So, be as quick as we can if the buffer is already dirty. We
! * do this by not acquiring spinlock if it looks like the status bits are
! * already. Since we make this test unlocked, there's a chance we might
! * fail to notice that the flags have just been cleared, and failed to
! * reset them, due to memory-ordering issues. But since this function is
! * only intended to be used in cases where failing to write out the data
* would be harmless anyway, it doesn't really matter.
*/
if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
(BM_DIRTY | BM_JUST_DIRTIED))
{
LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0);
if (!(bufHdr->flags & BM_DIRTY))
{
! /* Do vacuum cost accounting */
VacuumPageDirty++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
- bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- UnlockBufHdr(bufHdr);
}
}
--- 2543,2647 ----
/*
* This routine might get called many times on the same page, if we are
* making the first scan after commit of an xact that added/deleted many
! * tuples. So, be as quick as we can if the buffer is already dirty. We do
! * this by not acquiring spinlock if it looks like the status bits are
! * already set. Since we make this test unlocked, there's a chance we
! * might fail to notice that the flags have just been cleared, and failed
! * to reset them, due to memory-ordering issues. But since this function
! * is only intended to be used in cases where failing to write out the data
* would be harmless anyway, it doesn't really matter.
*/
if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
(BM_DIRTY | BM_JUST_DIRTIED))
{
+ XLogRecPtr lsn = InvalidXLogRecPtr;
+ bool dirtied = false;
+ bool delayChkpt = false;
+
+ /*
+ * If checksums are enabled, then a full page image may be required
+ * even for some hint bit updates to protect against torn pages. This
+ * full page image is only necessary if the hint bit update is the
+ * first change to the page since the last checkpoint.
+ *
+ * We don't check full_page_writes here because that logic is in
+ * xlog.c.
+ */
+ if (DataChecksumsEnabled())
+ {
+ /*
+ * If we're in recovery we cannot dirty a page because of a hint.
+ * We can set the hint, just not dirty the page as a result, so the
+ * hint is lost when we evict the page or shut down.
+ *
+ * See long discussion in bufpage.c
+ */
+ if (RecoveryInProgress())
+ return;
+
+ /*
+ * If the block is already dirty because we either made a change
+ * or set a hint already, then we don't need to write a full page
+ * image. Note that aggressive cleaning of blocks
+ * dirtied by hint bit setting would increase the call rate.
+ * Bulk setting of hint bits would reduce the call rate...
+ *
+ * We must issue the WAL record before we mark the buffer dirty.
+ * Otherwise we might write the page before we write the WAL.
+ * That causes a race condition, since a checkpoint might occur
+ * between writing the WAL record and marking the buffer dirty.
+ * We solve that with a kluge, but one that is already in use
+ * during transaction commit to prevent race conditions.
+ * Basically, we simply prevent the checkpoint WAL record from
+ * being written until we have marked the buffer dirty. We don't
+ * start the checkpoint flush until we have marked dirty, so our
+ * checkpoint must flush the change to disk successfully or the
+ * checkpoint never gets written, in which case crash recovery will fix it.
+ *
+ * It's possible we may enter here without an xid, so it is
+ * essential that CreateCheckpoint waits for virtual transactions
+ * rather than full transactionids.
+ */
+ MyPgXact->delayChkpt = delayChkpt = true;
+ lsn = XLogSaveBufferForHint(buffer);
+ }
+
LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0);
if (!(bufHdr->flags & BM_DIRTY))
{
! dirtied = true; /* Means "will be dirtied by this action" */
!
! /*
! * Set the page LSN if we wrote a backup block. We aren't
! * supposed to set this when only holding a share lock but
! * as long as we serialise it somehow we're OK. We choose to
! * set LSN while holding the buffer header lock, which causes
! * any reader of an LSN who holds only a share lock to also
! * obtain a buffer header lock before using PageGetLSN().
! * Fortunately, that's not too many places.
! *
! * If checksums are enabled, you might think we should reset the
! * checksum here. That will happen when the page is written
! * sometime later in this checkpoint cycle.
! */
! if (!XLogRecPtrIsInvalid(lsn))
! {
! PageSetLSN(page, lsn);
! }
! }
! bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! UnlockBufHdr(bufHdr);
!
! if (delayChkpt)
! MyPgXact->delayChkpt = false;
!
! if (dirtied)
! {
VacuumPageDirty++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
}
}
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
***************
*** 200,205 **** LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
--- 200,207 ----
/* Find smgr relation for buffer */
oreln = smgropen(bufHdr->tag.rnode, MyBackendId);
+ /* XXX do we want to write checksums for local buffers? An option? */
+
/* And write... */
smgrwrite(oreln,
bufHdr->tag.forkNum,
*** a/src/backend/storage/freespace/freespace.c
--- b/src/backend/storage/freespace/freespace.c
***************
*** 216,222 **** XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
PageInit(page, BLCKSZ, 0);
if (fsm_set_avail(page, slot, new_cat))
! MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
}
--- 216,222 ----
PageInit(page, BLCKSZ, 0);
if (fsm_set_avail(page, slot, new_cat))
! MarkBufferDirtyHint(buf);
UnlockReleaseBuffer(buf);
}
***************
*** 286,292 **** FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
return; /* nothing to do; the FSM was already smaller */
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
! MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
--- 286,292 ----
return; /* nothing to do; the FSM was already smaller */
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
! MarkBufferDirtyHint(buf);
UnlockReleaseBuffer(buf);
new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
***************
*** 583,588 **** fsm_extend(Relation rel, BlockNumber fsm_nblocks)
--- 583,590 ----
while (fsm_nblocks_now < fsm_nblocks)
{
+ PageSetChecksumInplace(pg, fsm_nblocks_now);
+
smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
(char *) pg, false);
fsm_nblocks_now++;
***************
*** 617,623 **** fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
page = BufferGetPage(buf);
if (fsm_set_avail(page, slot, newValue))
! MarkBufferDirty(buf);
if (minValue != 0)
{
--- 619,625 ----
page = BufferGetPage(buf);
if (fsm_set_avail(page, slot, newValue))
! MarkBufferDirtyHint(buf);
if (minValue != 0)
{
***************
*** 768,774 **** fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
{
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
fsm_set_avail(BufferGetPage(buf), slot, child_avail);
! MarkBufferDirty(buf);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
}
--- 770,776 ----
{
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
fsm_set_avail(BufferGetPage(buf), slot, child_avail);
! MarkBufferDirtyHint(buf);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
}
*** a/src/backend/storage/freespace/fsmpage.c
--- b/src/backend/storage/freespace/fsmpage.c
***************
*** 284,290 **** restart:
exclusive_lock_held = true;
}
fsm_rebuild_page(page);
! MarkBufferDirty(buf);
goto restart;
}
}
--- 284,290 ----
exclusive_lock_held = true;
}
fsm_rebuild_page(page);
! MarkBufferDirtyHint(buf);
goto restart;
}
}
*** a/src/backend/storage/page/bufpage.c
--- b/src/backend/storage/page/bufpage.c
***************
*** 15,21 ****
--- 15,27 ----
#include "postgres.h"
#include "access/htup_details.h"
+ #include "access/xlog.h"
+ static char pageCopyData[BLCKSZ]; /* for checksum calculation */
+ static Page pageCopy = pageCopyData;
+
+ static bool PageChecksumOK(Page page, BlockNumber blkno);
+ static uint16 PageCalcChecksum16(Page page, BlockNumber blkno);
/* ----------------------------------------------------------------
* Page support functions
***************
*** 25,30 ****
--- 31,38 ----
/*
* PageInit
* Initializes the contents of a page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
*/
void
PageInit(Page page, Size pageSize, Size specialSize)
***************
*** 39,45 **** PageInit(Page page, Size pageSize, Size specialSize)
/* Make sure all fields of page are zero, as well as unused space */
MemSet(p, 0, pageSize);
! /* p->pd_flags = 0; done by above MemSet */
p->pd_lower = SizeOfPageHeaderData;
p->pd_upper = pageSize - specialSize;
p->pd_special = pageSize - specialSize;
--- 47,56 ----
/* Make sure all fields of page are zero, as well as unused space */
MemSet(p, 0, pageSize);
! if (DataChecksumsEnabled())
! p->pd_flags = PD_CHECKSUMS1 | PD_CHECKSUMS2;
! else
! p->pd_flags = 0;
p->pd_lower = SizeOfPageHeaderData;
p->pd_upper = pageSize - specialSize;
p->pd_special = pageSize - specialSize;
***************
*** 49,55 **** PageInit(Page page, Size pageSize, Size specialSize)
/*
! * PageHeaderIsValid
* Check that the header fields of a page appear valid.
*
* This is called when a page has just been read in from disk. The idea is
--- 60,66 ----
/*
! * PageIsVerified
* Check that the header fields of a page appear valid.
*
* This is called when a page has just been read in from disk. The idea is
***************
*** 67,87 **** PageInit(Page page, Size pageSize, Size specialSize)
* will clean up such a page and make it usable.
*/
bool
! PageHeaderIsValid(PageHeader page)
{
char *pagebytes;
int i;
! /* Check normal case */
! if (PageGetPageSize(page) == BLCKSZ &&
! PageGetPageLayoutVersion(page) == PG_PAGE_LAYOUT_VERSION &&
! (page->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
! page->pd_lower >= SizeOfPageHeaderData &&
! page->pd_lower <= page->pd_upper &&
! page->pd_upper <= page->pd_special &&
! page->pd_special <= BLCKSZ &&
! page->pd_special == MAXALIGN(page->pd_special))
! return true;
/* Check all-zeroes case */
pagebytes = (char *) page;
--- 78,104 ----
* will clean up such a page and make it usable.
*/
bool
! PageIsVerified(Page page, BlockNumber blkno)
{
+ PageHeader p = (PageHeader) page;
char *pagebytes;
int i;
! /*
! * Don't verify page data unless the page passes a basic non-zero test
! */
! if (!PageIsNew(page))
! {
! /* Check normal case */
! if (PageChecksumOK(page, blkno) &&
! (p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
! (p->pd_flags & PD_HEADERCHECK) == 0 &&
! p->pd_lower <= p->pd_upper &&
! p->pd_upper <= p->pd_special &&
! p->pd_special <= BLCKSZ &&
! p->pd_special == MAXALIGN(p->pd_special))
! return true;
! }
/* Check all-zeroes case */
pagebytes = (char *) page;
***************
*** 827,829 **** PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
--- 844,1074 ----
pfree(itemidbase);
}
+
+ /*
+ * Test whether the page checksum is correct or not.
+ *
+ * IMPORTANT NOTE -
+ * The checksum is not valid at all times on a data page. We set it before we
+ * flush page/buffer, and implicitly invalidate the checksum when we modify the
+ * page. A heavily accessed buffer might then spend most of its life with an
+ * invalid page checksum, so testing random pages in the buffer pool will tell
+ * you nothing. The reason for this is that the checksum detects otherwise
+ * silent errors caused by the filesystems on which we rely. We do not protect
+ * buffers against uncorrectable memory errors, since these have a very low
+ * measured incidence according to research on large server farms,
+ * http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22.
+ *
+ * That means that WAL-logged changes to a page do NOT update the page
+ * checksum, so full page images may not have a valid checksum. But those page
+ * images have the WAL CRC covering them and so are verified separately from
+ * this mechanism.
+ *
+ * Any write of a data block can cause a torn page if the write is unsuccessful.
+ * Full page writes protect us from that, which are stored in WAL. Setting
+ * hint bits when a page is already dirty is OK because a full page write
+ * must already have been written for that since the last checkpoint.
+ * Setting hint bits on an otherwise clean page can allow torn pages; this
+ * doesn't normally matter since they are just hints. When the page has
+ * checksums, losing a few bits would cause the checksum to be invalid.
+ * So if we have full_page_writes = on and checksums enabled then we must
+ * write a WAL record specifically so that we record a full page image in WAL.
+ * New WAL records cannot be written during recovery, so hint bits set
+ * during recovery must not dirty the page if the buffer is not already dirty,
+ * when checksums are enabled.
+ *
+ * WAL replay ignores page checksums unless it writes out or reads in blocks
+ * from disk; restoring full page images does not verify checksums via this
+ * function.
+ *
+ * The best way to understand this is that WAL CRCs protect records entering
+ * the WAL stream, and page verification protects blocks entering the shared
+ * buffer pool. They are similar in purpose, yet completely separate.
+ * Together they ensure we are able to detect errors in data re-entering
+ * PostgreSQL-controlled memory. Note also that the WAL checksum is a
+ * 32-bit CRC, whereas the page checksum is a Fletcher checksum, not a CRC.
+ *
+ * This function returns a boolean, not a full damage assessment.
+ */
+ static bool
+ PageChecksumOK(Page page, BlockNumber blkno)
+ {
+ PageHeader p = (PageHeader) page;
+ uint16 checksum;
+ uint16 checksum_mask = PD_CHECKSUMS1 | PD_CHECKSUMS2;
+
+ /* Quick exit if nobody cares about checksumming */
+ if (!DataChecksumsEnabled())
+ {
+ /*
+ * We don't verify that the checksum itself is zero here, because pages
+ * upgraded from previous versions may still hold the TLI in the
+ * checksum field.
+ */
+ if ((p->pd_flags & checksum_mask) != 0)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("unexpected checksum flags on page")));
+ return false;
+ }
+ return true;
+ }
+
+ if ((p->pd_flags & checksum_mask) != checksum_mask)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("checksum flags missing on page")));
+ return false;
+ }
+
+ checksum = PageCalcChecksum16(page, blkno);
+
+ if (checksum != p->pd_checksum)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("page verification failed, calculated checksum %u but expected %u",
+ checksum, p->pd_checksum)));
+ return false;
+ }
+
+ return true;
+ }
+
+ /*
+ * Set checksum for page in shared buffers.
+ *
+ * If checksums are disabled, or if the page is not initialized, just return
+ * the input. Otherwise, we must make a copy of the page before calculating the
+ * checksum, to prevent concurrent modifications (e.g. setting hint bits) from
+ * making the final checksum invalid.
+ *
+ * Returns a pointer to the block-sized data that needs to be written. Uses
+ * statically-allocated memory, so the caller must immediately write the
+ * returned page and not refer to it again.
+ */
+ char *
+ PageSetChecksumCopy(Page page, BlockNumber blkno)
+ {
+ if (PageIsNew(page) || !DataChecksumsEnabled())
+ return (char *) page;
+
+ /*
+ * We make a copy iff we need to calculate a checksum because other
+ * backends may set hint bits on this page while we write, which
+ * would mean the checksum differs from the page contents. It doesn't
+ * matter if we include or exclude hints during the copy, as long
+ * as we write a valid page and associated checksum.
+ */
+ memcpy((char *) pageCopy, (char *) page, BLCKSZ);
+ PageSetChecksumInplace(pageCopy, blkno);
+ return (char *) pageCopy;
+ }
+
+ /*
+ * Set checksum for page in private memory.
+ *
+ * This is a simpler version of PageSetChecksumCopy(). The more explicit API
+ * allows us to more easily see if we're making the correct call and reduces
+ * the amount of additional code specific to page verification.
+ */
+ void
+ PageSetChecksumInplace(Page page, BlockNumber blkno)
+ {
+ if (PageIsNew(page))
+ return;
+
+ if (DataChecksumsEnabled())
+ {
+ PageHeader p = (PageHeader) page;
+ p->pd_checksum = PageCalcChecksum16(page, blkno);
+ }
+
+ return;
+ }
+
+ /*
+ * Calculate checksum for a PostgreSQL Page. This includes the page number (to
+ * detect the case when a page is somehow moved to a different location), the
+ * page header (excluding the checksum itself), and the page data.
+ *
+ * The checksum algorithm is a modified Fletcher 64-bit (which is
+ * order-sensitive). The modification is because, at the end, we have two
+ * 64-bit sums, but we only have room for a 16-bit checksum. So, instead of
+ * using a modulus of 2^32 - 1, we use 2^8 - 1, making it also resemble a
+ * Fletcher 16-bit. We don't use Fletcher 16-bit directly, because processing
+ * single bytes at a time is slower.
+ */
+ static uint16
+ PageCalcChecksum16(Page page, BlockNumber blkno)
+ {
+ PageHeaderData header_copy;
+ uint32 *ptr32Header = (uint32 *) &header_copy;
+ uint32 *ptr32Page = (uint32 *) page;
+ int64 sum1 = 0;
+ int64 sum2 = 0;
+ uint16 checksum = 0;
+ uint8 *p8Checksum = (uint8 *) &checksum;
+ int i;
+
+ /* only calculate the checksum for properly-initialized pages */
+ Assert(!PageIsNew(page));
+
+ /*
+ * Initialize the checksum calculation with the page number. This helps
+ * catch corruption from whole pages being transposed with other whole
+ * pages.
+ */
+ sum1 = sum2 = (uint64) blkno;
+
+ /*
+ * Make a copy of the page header and set the checksum to zero in the
+ * copy. That allows us to calculate the checksum 32 bits at a time while
+ * ignoring only the checksum field during calculation.
+ */
+ memcpy(&header_copy, page, SizeOfPageHeaderData);
+ header_copy.pd_checksum = 0;
+
+ /* compute the checksum of the header */
+ for (i = 0; i < SizeOfPageHeaderData / sizeof(uint32); i++)
+ {
+ sum1 += ptr32Header[i];
+ sum2 += sum1;
+ }
+
+ /* now checksum the rest of the page */
+ for (i = SizeOfPageHeaderData / sizeof(uint32); i < BLCKSZ / sizeof(uint32); i++)
+ {
+ sum1 += ptr32Page[i];
+ sum2 += sum1;
+
+ /*
+ * Testing for overflow makes the algorithm slower, but we know that
+ * overflow won't happen, so only use an Assert. The overflow won't
+ * happen because sum2 (the larger sum) can grow to a maximum of:
+ *
+ * 2^32 * (N^2 - N)/2
+ *
+ * where N is the number of iterations of this loop. The largest block
+ * size is 32KB, which is 8192 iterations, which yields a number less
+ * than 2^61, which is still within the range of a signed int64.
+ */
+ Assert(BLCKSZ <= 32768 && sum1 >= 0 && sum2 >= 0);
+ }
+
+ /*
+ * Store the sums as bytes in the checksum. Each sum is taken modulo
+ * 255, giving 0..254, and one is added to shift the range to 1..255,
+ * making zero an invalid value for a checksum byte (which seems wise).
+ */
+ p8Checksum[0] = (sum1 % 255) + 1;
+ p8Checksum[1] = (sum2 % 255) + 1;
+
+ #ifdef DEBUG_CHECKSUM
+ elog(LOG, "checksum %u", checksum);
+ #endif
+
+ return checksum;
+ }
*** a/src/backend/utils/time/tqual.c
--- b/src/backend/utils/time/tqual.c
***************
*** 6,12 ****
* NOTE: all the HeapTupleSatisfies routines will update the tuple's
* "hint" status bits if we see that the inserting or deleting transaction
* has now committed or aborted (and it is safe to set the hint bits).
! * If the hint bits are changed, SetBufferCommitInfoNeedsSave is called on
* the passed-in buffer. The caller must hold not only a pin, but at least
* shared buffer content lock on the buffer containing the tuple.
*
--- 6,12 ----
* NOTE: all the HeapTupleSatisfies routines will update the tuple's
* "hint" status bits if we see that the inserting or deleting transaction
* has now committed or aborted (and it is safe to set the hint bits).
! * If the hint bits are changed, MarkBufferDirtyHint is called on
* the passed-in buffer. The caller must hold not only a pin, but at least
* shared buffer content lock on the buffer containing the tuple.
*
***************
*** 121,127 **** SetHintBits(HeapTupleHeader tuple, Buffer buffer,
}
tuple->t_infomask |= infomask;
! SetBufferCommitInfoNeedsSave(buffer);
}
/*
--- 121,127 ----
}
tuple->t_infomask |= infomask;
! MarkBufferDirtyHint(buffer);
}
/*
*** a/src/bin/initdb/initdb.c
--- b/src/bin/initdb/initdb.c
***************
*** 120,125 **** static bool noclean = false;
--- 120,126 ----
static bool do_sync = true;
static bool sync_only = false;
static bool show_setting = false;
+ static bool data_checksums = false;
static char *xlog_dir = "";
***************
*** 1471,1478 **** bootstrap_template1(void)
unsetenv("PGCLIENTENCODING");
snprintf(cmd, sizeof(cmd),
! "\"%s\" --boot -x1 %s %s",
! backend_exec, boot_options, talkargs);
PG_CMD_OPEN;
--- 1472,1481 ----
unsetenv("PGCLIENTENCODING");
snprintf(cmd, sizeof(cmd),
! "\"%s\" --boot -x1 %s %s %s",
! backend_exec,
! data_checksums ? "-k" : "",
! boot_options, talkargs);
PG_CMD_OPEN;
***************
*** 2778,2783 **** usage(const char *progname)
--- 2781,2787 ----
printf(_(" -X, --xlogdir=XLOGDIR location for the transaction log directory\n"));
printf(_("\nLess commonly used options:\n"));
printf(_(" -d, --debug generate lots of debugging output\n"));
+ printf(_(" -k, --data-checksums data page checksums\n"));
printf(_(" -L DIRECTORY where to find the input files\n"));
printf(_(" -n, --noclean do not clean up after errors\n"));
printf(_(" -N, --nosync do not wait for changes to be written safely to disk\n"));
***************
*** 3426,3431 **** main(int argc, char *argv[])
--- 3430,3436 ----
{"nosync", no_argument, NULL, 'N'},
{"sync-only", no_argument, NULL, 'S'},
{"xlogdir", required_argument, NULL, 'X'},
+ {"data-checksums", no_argument, NULL, 'k'},
{NULL, 0, NULL, 0}
};
***************
*** 3457,3463 **** main(int argc, char *argv[])
/* process command-line options */
! while ((c = getopt_long(argc, argv, "dD:E:L:nNU:WA:sST:X:", long_options, &option_index)) != -1)
{
switch (c)
{
--- 3462,3468 ----
/* process command-line options */
! while ((c = getopt_long(argc, argv, "dD:E:kL:nNU:WA:sST:X:", long_options, &option_index)) != -1)
{
switch (c)
{
***************
*** 3506,3511 **** main(int argc, char *argv[])
--- 3511,3519 ----
case 'S':
sync_only = true;
break;
+ case 'k':
+ data_checksums = true;
+ break;
case 'L':
share_path = pg_strdup(optarg);
break;
*** a/src/bin/pg_controldata/pg_controldata.c
--- b/src/bin/pg_controldata/pg_controldata.c
***************
*** 282,286 **** main(int argc, char *argv[])
--- 282,288 ----
(ControlFile.float4ByVal ? _("by value") : _("by reference")));
printf(_("Float8 argument passing: %s\n"),
(ControlFile.float8ByVal ? _("by value") : _("by reference")));
+ printf(_("Data page checksums: %s\n"),
+ (ControlFile.data_checksums ? _("enabled") : _("disabled")));
return 0;
}
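
Taken together with the initdb change above, checksums are chosen once at
initdb time and then reported by pg_controldata. A usage sketch (paths
illustrative, output abbreviated):

    $ initdb -D /usr/local/pgsql/data --data-checksums
    $ pg_controldata /usr/local/pgsql/data | grep checksums
    Data page checksums: enabled
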
*** a/src/bin/pg_resetxlog/pg_resetxlog.c
--- b/src/bin/pg_resetxlog/pg_resetxlog.c
***************
*** 618,623 **** PrintControlValues(bool guessed)
--- 618,625 ----
(ControlFile.float4ByVal ? _("by value") : _("by reference")));
printf(_("Float8 argument passing: %s\n"),
(ControlFile.float8ByVal ? _("by value") : _("by reference")));
+ printf(_("Data page checksums: %s\n"),
+ (ControlFile.data_checksums ? _("enabled") : _("disabled")));
}
*** a/src/include/access/heapam_xlog.h
--- b/src/include/access/heapam_xlog.h
***************
*** 279,285 **** extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid, MultiXactId cutoff_multi,
OffsetNumber *offsets, int offcnt);
! extern XLogRecPtr log_heap_visible(RelFileNode rnode, BlockNumber block,
Buffer vm_buffer, TransactionId cutoff_xid);
extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
BlockNumber blk, Page page);
--- 279,285 ----
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid, MultiXactId cutoff_multi,
OffsetNumber *offsets, int offcnt);
! extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
Buffer vm_buffer, TransactionId cutoff_xid);
extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
BlockNumber blk, Page page);
*** a/src/include/access/visibilitymap.h
--- b/src/include/access/visibilitymap.h
***************
*** 24,31 **** extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk,
extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
Buffer *vmbuf);
extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
! extern void visibilitymap_set(Relation rel, BlockNumber heapBlk,
! XLogRecPtr recptr, Buffer vmbuf, TransactionId cutoff_xid);
extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
extern BlockNumber visibilitymap_count(Relation rel);
extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks);
--- 24,31 ----
extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
Buffer *vmbuf);
extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
! extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
! XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid);
extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
extern BlockNumber visibilitymap_count(Relation rel);
extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks);
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
***************
*** 267,272 **** extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
--- 267,274 ----
extern int XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock);
extern int XLogFileOpen(XLogSegNo segno);
+ extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer);
+
extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
extern void XLogSetAsyncXactLSN(XLogRecPtr record);
***************
*** 294,299 **** extern char *XLogFileNameP(TimeLineID tli, XLogSegNo segno);
--- 296,302 ----
extern void UpdateControlFile(void);
extern uint64 GetSystemIdentifier(void);
+ extern bool DataChecksumsEnabled(void);
extern Size XLOGShmemSize(void);
extern void XLOGShmemInit(void);
extern void BootStrapXLOG(void);
*** a/src/include/catalog/pg_control.h
--- b/src/include/catalog/pg_control.h
***************
*** 63,69 **** typedef struct CheckPoint
#define XLOG_BACKUP_END 0x50
#define XLOG_PARAMETER_CHANGE 0x60
#define XLOG_RESTORE_POINT 0x70
! #define XLOG_FPW_CHANGE 0x80
/*
--- 63,70 ----
#define XLOG_BACKUP_END 0x50
#define XLOG_PARAMETER_CHANGE 0x60
#define XLOG_RESTORE_POINT 0x70
! #define XLOG_FPW_CHANGE 0x80
! #define XLOG_HINT 0x90
/*
***************
*** 207,212 **** typedef struct ControlFileData
--- 208,216 ----
bool float4ByVal; /* float4 pass-by-value? */
bool float8ByVal; /* float8, int8, etc pass-by-value? */
+ /* Are data pages protected by checksums? */
+ bool data_checksums;
+
/* CRC of all above ... MUST BE LAST! */
pg_crc32 crc;
} ControlFileData;
*** a/src/include/storage/bufmgr.h
--- b/src/include/storage/bufmgr.h
***************
*** 203,209 **** extern Size BufferShmemSize(void);
extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
ForkNumber *forknum, BlockNumber *blknum);
! extern void SetBufferCommitInfoNeedsSave(Buffer buffer);
extern void UnlockBuffers(void);
extern void LockBuffer(Buffer buffer, int mode);
--- 203,209 ----
extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
ForkNumber *forknum, BlockNumber *blknum);
! extern void MarkBufferDirtyHint(Buffer buffer);
extern void UnlockBuffers(void);
extern void LockBuffer(Buffer buffer, int mode);
*** a/src/include/storage/bufpage.h
--- b/src/include/storage/bufpage.h
***************
*** 15,20 ****
--- 15,21 ----
#define BUFPAGE_H
#include "access/xlogdefs.h"
+ #include "storage/block.h"
#include "storage/item.h"
#include "storage/off.h"
***************
*** 163,176 **** typedef PageHeaderData *PageHeader;
* PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
* page for its new tuple version; this suggests that a prune is needed.
* Again, this is just a hint.
*/
#define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */
#define PD_PAGE_FULL 0x0002 /* not enough free space for new
* tuple? */
#define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to
* everyone */
! #define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */
/*
* Page layout version number 0 is for pre-7.3 Postgres releases.
--- 164,196 ----
* PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
* page for its new tuple version; this suggests that a prune is needed.
* Again, this is just a hint.
+ *
+ * PD_CHECKSUMS1 and PD_CHECKSUMS2 indicate the presence of a checksum. This
+ * leaves room for future support of enabling/disabling checksums while the
+ * system is online. Trusting page data to say how to check page data is
+ * somewhat self-referential, so to avoid falsely concluding that a page has
+ * no checksum, we set two non-adjacent bits to signify that the page is
+ * checksummed and must be verified when the block is read back into a
+ * buffer. Two bits guard against a multi-bit error that clears one of the
+ * checksum flags *and* destroys data, which would otherwise skip the
+ * checksum check and silently accept bad data. We additionally require that
+ * a third bit (PD_HEADERCHECK) be zero regardless of whether a checksum is
+ * present.
*/
#define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */
#define PD_PAGE_FULL 0x0002 /* not enough free space for new
* tuple? */
#define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to
* everyone */
+ #define PD_CHECKSUMS1 0x0008 /* bit indicating the presence of
+ * checksums */
+ #define PD_HEADERCHECK 0x0010 /* always zero -- if set, indicates
+ * corruption */
+
+ #define PD_CHECKSUMS2 0x8000 /* bit indicating the presence of
+ * checksums */
! #define PD_VALID_FLAG_BITS 0x801F /* OR of all valid pd_flags bits */
/*
* Page layout version number 0 is for pre-7.3 Postgres releases.
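
The flag discipline described in the comment above (both non-adjacent checksum
bits set, header-check bit clear) can be captured as a predicate. A standalone
sketch, using the hypothetical helper name page_claims_checksum, which does
not exist in the patch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PD_CHECKSUMS1  0x0008
    #define PD_HEADERCHECK 0x0010
    #define PD_CHECKSUMS2  0x8000

    /* Hypothetical helper, not in the patch: a page claims to carry a
     * checksum only if BOTH non-adjacent checksum bits are set AND the
     * always-zero header-check bit is clear; anything else means either
     * "no checksum" or a corrupted flags word. */
    static bool
    page_claims_checksum(uint16_t pd_flags)
    {
        return (pd_flags & PD_CHECKSUMS1) != 0 &&
               (pd_flags & PD_CHECKSUMS2) != 0 &&
               (pd_flags & PD_HEADERCHECK) == 0;
    }

    int
    main(void)
    {
        printf("%d\n", page_claims_checksum(PD_CHECKSUMS1 | PD_CHECKSUMS2)); /* 1 */
        printf("%d\n", page_claims_checksum(PD_CHECKSUMS1));                 /* 0: one bit lost */
        printf("%d\n", page_claims_checksum(PD_CHECKSUMS1 | PD_CHECKSUMS2 |
                                            PD_HEADERCHECK));                /* 0: corruption */
        return 0;
    }
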
***************
*** 378,384 **** do { \
*/
extern void PageInit(Page page, Size pageSize, Size specialSize);
! extern bool PageHeaderIsValid(PageHeader page);
extern OffsetNumber PageAddItem(Page page, Item item, Size size,
OffsetNumber offsetNumber, bool overwrite, bool is_heap);
extern Page PageGetTempPage(Page page);
--- 398,404 ----
*/
extern void PageInit(Page page, Size pageSize, Size specialSize);
! extern bool PageIsVerified(Page page, BlockNumber blkno);
extern OffsetNumber PageAddItem(Page page, Item item, Size size,
OffsetNumber offsetNumber, bool overwrite, bool is_heap);
extern Page PageGetTempPage(Page page);
***************
*** 391,395 **** extern Size PageGetExactFreeSpace(Page page);
--- 411,417 ----
extern Size PageGetHeapFreeSpace(Page page);
extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
+ extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
+ extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
#endif /* BUFPAGE_H */
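
The two new prototypes imply two write-time paths: an in-place variant for
pages the caller has exclusive access to, and a copy-based variant for shared
buffers, where a concurrent hint-bit setter could otherwise change bytes
between checksum calculation and the write. A standalone sketch of that
pattern, under those assumptions, with toy_* names and a stand-in hash that
are not part of the patch:

    #include <stdint.h>
    #include <string.h>

    #define BLCKSZ       8192
    #define CHECKSUM_OFF 8      /* illustrative offset of the checksum field */

    /* Stand-in for the real PageCalcChecksum16; any page hash would do. */
    static uint16_t
    toy_checksum(const char *page, uint32_t blkno)
    {
        uint32_t sum = blkno;

        for (int i = 0; i < BLCKSZ; i++)
            sum = sum * 31 + (uint8_t) page[i];
        return (uint16_t) sum;
    }

    /* In-place variant: safe only with exclusive access to the page,
     * which is what PageSetChecksumInplace's signature suggests. */
    static void
    toy_set_checksum_inplace(char *page, uint32_t blkno)
    {
        uint16_t cks;

        memset(page + CHECKSUM_OFF, 0, sizeof(cks)); /* exclude old value */
        cks = toy_checksum(page, blkno);
        memcpy(page + CHECKSUM_OFF, &cks, sizeof(cks));
    }

    /* Copy variant: snapshot the page first so concurrent hint-bit
     * updates cannot change bytes between checksumming and writing; the
     * caller writes out the returned copy, mirroring what
     * PageSetChecksumCopy's name suggests. */
    static char *
    toy_set_checksum_copy(const char *page, uint32_t blkno, char *scratch)
    {
        memcpy(scratch, page, BLCKSZ);
        toy_set_checksum_inplace(scratch, blkno);
        return scratch;
    }

    int
    main(void)
    {
        static char page[BLCKSZ], scratch[BLCKSZ];

        toy_set_checksum_inplace(page, 42);
        (void) toy_set_checksum_copy(page, 42, scratch);
        return 0;
    }
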