From 6c4c004a2a7f5f269dc33942f7c397fe962c8685 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Wed, 8 Mar 2017 13:48:33 -0300 Subject: [PATCH 2/6] track root lp v16 --- src/backend/access/heap/heapam.c | 209 ++++++++++++++++++++++++++++------ src/backend/access/heap/hio.c | 25 +++- src/backend/access/heap/pruneheap.c | 126 ++++++++++++++++++-- src/backend/access/heap/rewriteheap.c | 21 +++- src/backend/executor/execIndexing.c | 3 +- src/backend/executor/execMain.c | 4 +- src/include/access/heapam.h | 1 + src/include/access/heapam_xlog.h | 4 +- src/include/access/hio.h | 4 +- src/include/access/htup_details.h | 97 +++++++++++++++- 10 files changed, 428 insertions(+), 66 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 74fb09c..93cde9a 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -94,7 +94,8 @@ static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, - HeapTuple newtup, HeapTuple old_key_tup, + HeapTuple newtup, OffsetNumber root_offnum, + HeapTuple old_key_tup, bool all_visible_cleared, bool new_all_visible_cleared); static Bitmapset *HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols, @@ -2248,13 +2249,13 @@ heap_get_latest_tid(Relation relation, */ if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || HeapTupleHeaderIsOnlyLocked(tp.t_data) || - ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) + HeapTupleHeaderIsHeapLatest(tp.t_data, &ctid)) { UnlockReleaseBuffer(buffer); break; } - ctid = tp.t_data->t_ctid; + HeapTupleHeaderGetNextTid(tp.t_data, &ctid); priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); UnlockReleaseBuffer(buffer); } /* end of loop */ @@ -2385,6 +2386,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, Buffer buffer; Buffer vmbuffer = InvalidBuffer; bool all_visible_cleared = false; + OffsetNumber root_offnum; /* * Fill in tuple header fields, assign an OID, and toast the tuple if @@ -2423,8 +2425,13 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); - RelationPutHeapTuple(relation, buffer, heaptup, - (options & HEAP_INSERT_SPECULATIVE) != 0); + root_offnum = RelationPutHeapTuple(relation, buffer, heaptup, + (options & HEAP_INSERT_SPECULATIVE) != 0, + InvalidOffsetNumber); + + /* We must not overwrite the speculative insertion token. */ + if ((options & HEAP_INSERT_SPECULATIVE) == 0) + HeapTupleHeaderSetHeapLatest(heaptup->t_data, root_offnum); if (PageIsAllVisible(BufferGetPage(buffer))) { @@ -2652,6 +2659,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, Size saveFreeSpace; bool need_tuple_data = RelationIsLogicallyLogged(relation); bool need_cids = RelationIsAccessibleInLogicalDecoding(relation); + OffsetNumber root_offnum; needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation); saveFreeSpace = RelationGetTargetPageFreeSpace(relation, @@ -2722,7 +2730,12 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, * RelationGetBufferForTuple has ensured that the first tuple fits. * Put that on the page, and then as many other tuples as fit. 
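	 */

Both insert paths use the new RelationPutHeapTuple() contract introduced above: when the caller passes InvalidOffsetNumber, the function returns the offset at which the tuple was placed, and that offset becomes the tuple's own root offset. A minimal illustrative sketch of that contract (the helper name put_new_tuple is hypothetical, not part of the patch):

/*
 * A brand-new tuple starts its own HOT chain, so its root line pointer is
 * simply wherever the tuple itself lands on the page.
 */
static OffsetNumber
put_new_tuple(Relation rel, Buffer buf, HeapTuple tup)
{
	/* No pre-existing chain: let RelationPutHeapTuple report the offset. */
	OffsetNumber root = RelationPutHeapTuple(rel, buf, tup, false,
											 InvalidOffsetNumber);

	/* Mark the tuple as the latest version, rooted at itself. */
	HeapTupleHeaderSetHeapLatest(tup->t_data, root);
	return root;
}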
-		RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
+		root_offnum = RelationPutHeapTuple(relation, buffer,
+										   heaptuples[ndone], false,
+										   InvalidOffsetNumber);
+
+		/* Mark this tuple as the latest and also set the root offset. */
+		HeapTupleHeaderSetHeapLatest(heaptuples[ndone]->t_data, root_offnum);
+
 		for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
 		{
 			HeapTuple	heaptup = heaptuples[ndone + nthispage];
@@ -2730,7 +2743,10 @@
 			if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
 				break;
 
-			RelationPutHeapTuple(relation, buffer, heaptup, false);
+			root_offnum = RelationPutHeapTuple(relation, buffer, heaptup,
+											   false, InvalidOffsetNumber);
+			/* Mark each tuple as the latest and also set the root offset. */
+			HeapTupleHeaderSetHeapLatest(heaptup->t_data, root_offnum);
 
 			/*
 			 * We don't use heap_multi_insert for catalog tuples yet, but
@@ -3002,6 +3018,7 @@ heap_delete(Relation relation, ItemPointer tid,
 	HeapTupleData tp;
 	Page		page;
 	BlockNumber block;
+	OffsetNumber offnum;
 	Buffer		buffer;
 	Buffer		vmbuffer = InvalidBuffer;
 	TransactionId new_xmax;
@@ -3012,6 +3029,7 @@ heap_delete(Relation relation, ItemPointer tid,
 	bool		all_visible_cleared = false;
 	HeapTuple	old_key_tuple = NULL;	/* replica identity of the tuple */
 	bool		old_key_copied = false;
+	OffsetNumber root_offnum;
 
 	Assert(ItemPointerIsValid(tid));
 
@@ -3053,7 +3071,8 @@ heap_delete(Relation relation, ItemPointer tid,
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 	}
 
-	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
+	offnum = ItemPointerGetOffsetNumber(tid);
+	lp = PageGetItemId(page, offnum);
 	Assert(ItemIdIsNormal(lp));
 
 	tp.t_tableOid = RelationGetRelid(relation);
@@ -3183,7 +3202,17 @@ l1:
 				result == HeapTupleUpdated ||
 				result == HeapTupleBeingUpdated);
 		Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
-		hufd->ctid = tp.t_data->t_ctid;
+
+		/*
+		 * If we're at the end of the chain, just return the tuple's own TID
+		 * back to the caller; the caller uses that as a hint that it has
+		 * reached the end of the chain.
+		 */
+		if (!HeapTupleHeaderIsHeapLatest(tp.t_data, &tp.t_self))
+			HeapTupleHeaderGetNextTid(tp.t_data, &hufd->ctid);
+		else
+			ItemPointerCopy(&tp.t_self, &hufd->ctid);
+
 		hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
 		if (result == HeapTupleSelfUpdated)
 			hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
@@ -3232,6 +3261,22 @@ l1:
 							  xid, LockTupleExclusive, true,
 							  &new_xmax, &new_infomask, &new_infomask2);
 
+	/*
+	 * heap_get_root_tuple() may call palloc, which is disallowed once we
+	 * enter the critical section. So check if the root offset is cached in
+	 * the tuple and, if not, fetch that information the hard way before
+	 * entering the critical section.
+	 *
+	 * Most often, unless we are dealing with a pg_upgraded cluster, the
+	 * root offset information should be cached, so there should not be too
+	 * much overhead in fetching it. Also, once a tuple is updated, the
+	 * information will be copied to the new version, so it's not as if
+	 * we're going to pay this price forever.
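+	 */

The same deferred lookup recurs in heap_update and heap_lock_tuple below: resolve the root line pointer while palloc is still allowed, then enter the critical section. A minimal sketch of the pattern (the helper name get_root_offset_safely is hypothetical, not part of the patch):

/*
 * Resolve the root line pointer of a tuple's HOT chain before entering a
 * critical section, since heap_get_root_tuple() may allocate memory.
 */
static OffsetNumber
get_root_offset_safely(Page page, HeapTupleHeader htup, ItemPointer self)
{
	if (HeapTupleHeaderHasRootOffset(htup))
		return HeapTupleHeaderGetRootOffset(htup);	/* cached, no palloc */

	/* Old-style chain (e.g. pg_upgraded data): scan the page to find it. */
	return heap_get_root_tuple(page, ItemPointerGetOffsetNumber(self));
}

+	/*
+	 * Fetch the root offset now, while memory allocation is still allowed.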
+ */ + if (!HeapTupleHeaderHasRootOffset(tp.t_data)) + root_offnum = heap_get_root_tuple(page, + ItemPointerGetOffsetNumber(&tp.t_self)); + START_CRIT_SECTION(); /* @@ -3259,8 +3304,10 @@ l1: HeapTupleHeaderClearHotUpdated(tp.t_data); HeapTupleHeaderSetXmax(tp.t_data, new_xmax); HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); - /* Make sure there is no forward chain link in t_ctid */ - tp.t_data->t_ctid = tp.t_self; + + /* Mark this tuple as the latest tuple in the update chain. */ + if (!HeapTupleHeaderHasRootOffset(tp.t_data)) + HeapTupleHeaderSetHeapLatest(tp.t_data, root_offnum); MarkBufferDirty(buffer); @@ -3461,6 +3508,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, bool old_key_copied = false; Page page; BlockNumber block; + OffsetNumber offnum; + OffsetNumber root_offnum; MultiXactStatus mxact_status; Buffer buffer, newbuf, @@ -3523,6 +3572,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, block = ItemPointerGetBlockNumber(otid); + offnum = ItemPointerGetOffsetNumber(otid); buffer = ReadBuffer(relation, block); page = BufferGetPage(buffer); @@ -3807,7 +3857,12 @@ l2: result == HeapTupleUpdated || result == HeapTupleBeingUpdated); Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); - hufd->ctid = oldtup.t_data->t_ctid; + + if (!HeapTupleHeaderIsHeapLatest(oldtup.t_data, &oldtup.t_self)) + HeapTupleHeaderGetNextTid(oldtup.t_data, &hufd->ctid); + else + ItemPointerCopy(&oldtup.t_self, &hufd->ctid); + hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data); if (result == HeapTupleSelfUpdated) hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); @@ -3947,6 +4002,7 @@ l2: uint16 infomask_lock_old_tuple, infomask2_lock_old_tuple; bool cleared_all_frozen = false; + OffsetNumber root_offnum; /* * To prevent concurrent sessions from updating the tuple, we have to @@ -3974,6 +4030,14 @@ l2: Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple)); + /* + * Fetch root offset before entering the critical section. We do this + * only if the information is not already available. + */ + if (!HeapTupleHeaderHasRootOffset(oldtup.t_data)) + root_offnum = heap_get_root_tuple(page, + ItemPointerGetOffsetNumber(&oldtup.t_self)); + START_CRIT_SECTION(); /* Clear obsolete visibility flags ... */ @@ -3988,7 +4052,8 @@ l2: HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); /* temporarily make it look not-updated, but locked */ - oldtup.t_data->t_ctid = oldtup.t_self; + if (!HeapTupleHeaderHasRootOffset(oldtup.t_data)) + HeapTupleHeaderSetHeapLatest(oldtup.t_data, root_offnum); /* * Clear all-frozen bit on visibility map if needed. We could @@ -4146,6 +4211,10 @@ l2: bms_overlap(modified_attrs, id_attrs), &old_key_copied); + if (!HeapTupleHeaderHasRootOffset(oldtup.t_data)) + root_offnum = heap_get_root_tuple(page, + ItemPointerGetOffsetNumber(&(oldtup.t_self))); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -4171,6 +4240,17 @@ l2: HeapTupleSetHeapOnly(heaptup); /* Mark the caller's copy too, in case different from heaptup */ HeapTupleSetHeapOnly(newtup); + /* + * For HOT (or WARM) updated tuples, we store the offset of the root + * line pointer of this chain in the ip_posid field of the new tuple. + * Usually this information will be available in the corresponding + * field of the old tuple. 
But for aborted updates or pg_upgraded
		 * databases, we might be seeing old-style CTID chains, and hence
		 * the information must be obtained the hard way (we should have
		 * done that before entering the critical section above).
		 */
		if (HeapTupleHeaderHasRootOffset(oldtup.t_data))
			root_offnum = HeapTupleHeaderGetRootOffset(oldtup.t_data);
	}
	else
	{
@@ -4178,10 +4258,22 @@ l2:
		HeapTupleClearHotUpdated(&oldtup);
		HeapTupleClearHeapOnly(heaptup);
		HeapTupleClearHeapOnly(newtup);
+		root_offnum = InvalidOffsetNumber;
	}

-	RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
-
+	/* insert new tuple */
+	root_offnum = RelationPutHeapTuple(relation, newbuf, heaptup, false,
+									   root_offnum);
+	/*
+	 * Also mark both copies as latest and set the root offset information.
+	 * If we're doing a HOT/WARM update, we just copy the information from
+	 * the old tuple if it's available, or use the value computed before
+	 * entering the critical section. For regular updates,
+	 * RelationPutHeapTuple must have returned the actual offset number at
+	 * which the new version was inserted, and we store that value, since
+	 * the update starts a new HOT chain.
+	 */
+	HeapTupleHeaderSetHeapLatest(heaptup->t_data, root_offnum);
+	HeapTupleHeaderSetHeapLatest(newtup->t_data, root_offnum);

	/* Clear obsolete visibility flags, possibly set by ourselves above... */
	oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
@@ -4194,7 +4286,7 @@ l2:
	HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);

	/* record address of new tuple in t_ctid of old one */
-	oldtup.t_data->t_ctid = heaptup->t_self;
+	HeapTupleHeaderSetNextTid(oldtup.t_data, &(heaptup->t_self));

	/* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
	if (PageIsAllVisible(BufferGetPage(buffer)))
@@ -4233,6 +4325,7 @@ l2:
		recptr = log_heap_update(relation, buffer,
								 newbuf, &oldtup, heaptup,
+								 root_offnum,
								 old_key_tuple,
								 all_visible_cleared,
								 all_visible_cleared_new);
@@ -4513,7 +4606,8 @@ heap_lock_tuple(Relation relation, HeapTuple tuple,
	ItemId		lp;
	Page		page;
	Buffer		vmbuffer = InvalidBuffer;
-	BlockNumber block;
+	BlockNumber block;
+	OffsetNumber offnum;
	TransactionId xid,
				xmax;
	uint16		old_infomask,
@@ -4522,9 +4616,11 @@ heap_lock_tuple(Relation relation, HeapTuple tuple,
	bool		first_time = true;
	bool		have_tuple_lock = false;
	bool		cleared_all_frozen = false;
+	OffsetNumber root_offnum;

	*buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
	block = ItemPointerGetBlockNumber(tid);
+	offnum = ItemPointerGetOffsetNumber(tid);

	/*
	 * Before locking the buffer, pin the visibility map page if it appears to
@@ -4544,6 +4640,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple,
	tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
	tuple->t_len = ItemIdGetLength(lp);
	tuple->t_tableOid = RelationGetRelid(relation);
+	tuple->t_self = *tid;

l3:
	result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
@@ -4571,7 +4668,11 @@ l3:
		xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
		infomask = tuple->t_data->t_infomask;
		infomask2 = tuple->t_data->t_infomask2;
-		ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
+
+		if (!HeapTupleHeaderIsHeapLatest(tuple->t_data, tid))
+			HeapTupleHeaderGetNextTid(tuple->t_data, &t_ctid);
+		else
+			ItemPointerCopy(tid, &t_ctid);

		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);

@@ -5009,7 +5110,12 @@ failed:
		Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated ||
			   result == HeapTupleWouldBlock);
		Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
-		hufd->ctid = tuple->t_data->t_ctid;
+
+		if (!HeapTupleHeaderIsHeapLatest(tuple->t_data, 
tid)) + HeapTupleHeaderGetNextTid(tuple->t_data, &hufd->ctid); + else + ItemPointerCopy(tid, &hufd->ctid); + hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); if (result == HeapTupleSelfUpdated) hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); @@ -5057,6 +5163,10 @@ failed: GetCurrentTransactionId(), mode, false, &xid, &new_infomask, &new_infomask2); + if (!HeapTupleHeaderHasRootOffset(tuple->t_data)) + root_offnum = heap_get_root_tuple(page, + ItemPointerGetOffsetNumber(&tuple->t_self)); + START_CRIT_SECTION(); /* @@ -5085,7 +5195,10 @@ failed: * the tuple as well. */ if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) - tuple->t_data->t_ctid = *tid; + { + if (!HeapTupleHeaderHasRootOffset(tuple->t_data)) + HeapTupleHeaderSetHeapLatest(tuple->t_data, root_offnum); + } /* Clear only the all-frozen bit on visibility map if needed */ if (PageIsAllVisible(page) && @@ -5599,6 +5712,7 @@ heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, bool cleared_all_frozen = false; Buffer vmbuffer = InvalidBuffer; BlockNumber block; + OffsetNumber offnum; ItemPointerCopy(tid, &tupid); @@ -5607,6 +5721,8 @@ heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, new_infomask = 0; new_xmax = InvalidTransactionId; block = ItemPointerGetBlockNumber(&tupid); + offnum = ItemPointerGetOffsetNumber(&tupid); + ItemPointerCopy(&tupid, &(mytup.t_self)); if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL)) @@ -5836,7 +5952,7 @@ l4: /* if we find the end of update chain, we're done. */ if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || - ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || + HeapTupleHeaderIsHeapLatest(mytup.t_data, &mytup.t_self) || HeapTupleHeaderIsOnlyLocked(mytup.t_data)) { result = HeapTupleMayBeUpdated; @@ -5845,7 +5961,7 @@ l4: /* tail recursion */ priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data); - ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid); + HeapTupleHeaderGetNextTid(mytup.t_data, &tupid); UnlockReleaseBuffer(buf); if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); @@ -5962,7 +6078,7 @@ heap_finish_speculative(Relation relation, HeapTuple tuple) * Replace the speculative insertion token with a real t_ctid, pointing to * itself like it does on regular tuples. 
*/ - htup->t_ctid = tuple->t_self; + HeapTupleHeaderSetHeapLatest(htup, offnum); /* XLOG stuff */ if (RelationNeedsWAL(relation)) @@ -6088,8 +6204,7 @@ heap_abort_speculative(Relation relation, HeapTuple tuple) HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId); /* Clear the speculative insertion token too */ - tp.t_data->t_ctid = tp.t_self; - + HeapTupleHeaderSetHeapLatest(tp.t_data, ItemPointerGetOffsetNumber(tid)); MarkBufferDirty(buffer); /* @@ -7437,6 +7552,7 @@ log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, + OffsetNumber root_offnum, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared) { @@ -7557,6 +7673,9 @@ log_heap_update(Relation reln, Buffer oldbuf, xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); + Assert(OffsetNumberIsValid(root_offnum)); + xlrec.root_offnum = root_offnum; + bufflags = REGBUF_STANDARD; if (init) bufflags |= REGBUF_WILL_INIT; @@ -8211,7 +8330,13 @@ heap_xlog_delete(XLogReaderState *record) PageClearAllVisible(page); /* Make sure there is no forward chain link in t_ctid */ - htup->t_ctid = target_tid; + if (!HeapTupleHeaderHasRootOffset(htup)) + { + OffsetNumber root_offnum; + root_offnum = heap_get_root_tuple(page, xlrec->offnum); + HeapTupleHeaderSetHeapLatest(htup, root_offnum); + } + PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -8301,7 +8426,8 @@ heap_xlog_insert(XLogReaderState *record) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); HeapTupleHeaderSetCmin(htup, FirstCommandId); - htup->t_ctid = target_tid; + + HeapTupleHeaderSetHeapLatest(htup, xlrec->offnum); if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, true, true) == InvalidOffsetNumber) @@ -8436,8 +8562,8 @@ heap_xlog_multi_insert(XLogReaderState *record) htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); HeapTupleHeaderSetCmin(htup, FirstCommandId); - ItemPointerSetBlockNumber(&htup->t_ctid, blkno); - ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); + + HeapTupleHeaderSetHeapLatest(htup, offnum); offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) @@ -8573,7 +8699,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Set forward chain link in t_ctid */ - htup->t_ctid = newtid; + HeapTupleHeaderSetNextTid(htup, &newtid); /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -8706,13 +8832,17 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); HeapTupleHeaderSetCmin(htup, FirstCommandId); HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); - /* Make sure there is no forward chain link in t_ctid */ - htup->t_ctid = newtid; offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple"); + /* + * Make sure the tuple is marked as the latest and root offset + * information is restored. 
+		 */
+		HeapTupleHeaderSetHeapLatest(htup, xlrec->root_offnum);
+
 		if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
 			PageClearAllVisible(page);
 
@@ -8775,6 +8905,9 @@ heap_xlog_confirm(XLogReaderState *record)
 		 */
 		ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
 
+		/* For a newly inserted tuple, set the root offset to itself. */
+		HeapTupleHeaderSetHeapLatest(htup, offnum);
+
 		PageSetLSN(page, lsn);
 		MarkBufferDirty(buffer);
 	}
@@ -8838,11 +8971,17 @@ heap_xlog_lock(XLogReaderState *record)
 		 */
 		if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
 		{
+			ItemPointerData target_tid;
+
+			ItemPointerSet(&target_tid, BufferGetBlockNumber(buffer), offnum);
 			HeapTupleHeaderClearHotUpdated(htup);
 			/* Make sure there is no forward chain link in t_ctid */
-			ItemPointerSet(&htup->t_ctid,
-						   BufferGetBlockNumber(buffer),
-						   offnum);
+			if (!HeapTupleHeaderHasRootOffset(htup))
+			{
+				OffsetNumber root_offnum;
+				root_offnum = heap_get_root_tuple(page, offnum);
+				HeapTupleHeaderSetHeapLatest(htup, root_offnum);
+			}
 		}
 		HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 6529fe3..8052519 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -31,12 +31,20 @@
  * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! Must PANIC on failure!!!
  *
  * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer.
+ *
+ * The caller can optionally tell us to set the root offset to the given
+ * value. Otherwise, the root offset is set to the offset of the new location
+ * once it's known. The former is used while updating an existing tuple,
+ * where the caller tells us the root line pointer of the chain. The latter
+ * is used during insertion of a new row, hence the root line pointer is set
+ * to the offset at which the tuple is inserted.
  */
-void
+OffsetNumber
 RelationPutHeapTuple(Relation relation,
 					 Buffer buffer,
 					 HeapTuple tuple,
-					 bool token)
+					 bool token,
+					 OffsetNumber root_offnum)
 {
 	Page		pageHeader;
 	OffsetNumber offnum;
@@ -60,17 +68,24 @@ RelationPutHeapTuple(Relation relation,
 	ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);
 
 	/*
-	 * Insert the correct position into CTID of the stored tuple, too (unless
-	 * this is a speculative insertion, in which case the token is held in
-	 * CTID field instead)
+	 * Set the block number and the root offset into the CTID of the stored
+	 * tuple, too (unless this is a speculative insertion, in which case the
+	 * token is held in the CTID field instead).
 	 */
 	if (!token)
 	{
 		ItemId		itemId = PageGetItemId(pageHeader, offnum);
 		Item		item = PageGetItem(pageHeader, itemId);
 
+		/* Copy t_ctid to set the correct block number. 
*/
 		((HeapTupleHeader) item)->t_ctid = tuple->t_self;
+
+		if (!OffsetNumberIsValid(root_offnum))
+			root_offnum = offnum;
+		HeapTupleHeaderSetHeapLatest((HeapTupleHeader) item, root_offnum);
 	}
+
+	return root_offnum;
 }
 
 /*
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index d69a266..f54337c 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -55,6 +55,8 @@ static void heap_prune_record_redirect(PruneState *prstate,
 static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
 static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
+static void heap_get_root_tuples_internal(Page page,
+				OffsetNumber target_offnum, OffsetNumber *root_offsets);
 
 /*
  * Optionally prune and repair fragmentation in the specified page.
@@ -553,6 +555,17 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
 
 			if (!HeapTupleHeaderIsHotUpdated(htup))
 				break;
+
+			/*
+			 * If the tuple was HOT-updated and the update was later
+			 * aborted, the tuple could have been marked as the last tuple
+			 * in the chain without the HOT-updated flag being cleared. So
+			 * we must check whether this is the last tuple in the chain and
+			 * stop following the CTID; otherwise we risk infinite recursion
+			 * (though prstate->marked[] currently protects against that).
+			 */
+			if (HeapTupleHeaderHasRootOffset(htup))
+				break;
 
 			/*
 			 * Advance to next chain member.
 			 */
@@ -726,27 +739,47 @@ heap_page_prune_execute(Buffer buffer,
 
 /*
- * For all items in this page, find their respective root line pointers.
- * If item k is part of a HOT-chain with root at item j, then we set
- * root_offsets[k - 1] = j.
+ * Either for all items in this page or for the given item, find their
+ * respective root line pointers.
  *
- * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
- * We zero out all unused entries.
+ * When target_offnum is a valid offset number, the caller is interested in
+ * just that one item. In that case, its root line pointer is returned in
+ * root_offsets[0].
+ *
+ * When target_offnum is InvalidOffsetNumber, the caller wants to know the
+ * root line pointers of all the items in this page. The root_offsets array
+ * must have MaxHeapTuplesPerPage entries in that case. If item k is part of
+ * a HOT-chain with root at item j, then we set root_offsets[k - 1] = j. We
+ * zero out all unused entries.
  *
  * The function must be called with at least share lock on the buffer, to
  * prevent concurrent prune operations.
  *
+ * This is not a cheap function, since it must scan through all line pointers
+ * and tuples on the page in order to find the root line pointers. To
+ * minimize the cost, we break early if target_offnum is specified and the
+ * root line pointer for target_offnum is found.
+ *
  * Note: The information collected here is valid only as long as the caller
  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
  * and reused by a completely unrelated tuple.
+ *
+ * Note: This function must not be called inside a critical section, because
+ * it internally calls HeapTupleHeaderGetUpdateXid, which somewhere down the
+ * stack may try to allocate heap memory. Memory allocation is disallowed in
+ * a critical section.
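+ */

For callers, the two wrappers declared in heapam.h behave as follows; a brief illustrative sketch (the function example_root_lookup is hypothetical, not part of the patch):

/* Demonstrates the single-item and whole-page lookup entry points. */
static void
example_root_lookup(Page page, OffsetNumber target)
{
	OffsetNumber root_offsets[MaxHeapTuplesPerPage];
	OffsetNumber root;

	/* Single item: returns that item's root line pointer directly. */
	root = heap_get_root_tuple(page, target);

	/* Whole page: fills root_offsets[k - 1] with the root of item k. */
	heap_get_root_tuples(page, root_offsets);

	(void) root;				/* keep the compiler quiet in this sketch */
}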
-void
-heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
+static void
+heap_get_root_tuples_internal(Page page, OffsetNumber target_offnum,
+							  OffsetNumber *root_offsets)
 {
 	OffsetNumber offnum,
 				maxoff;
 
-	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
+	if (OffsetNumberIsValid(target_offnum))
+		*root_offsets = InvalidOffsetNumber;
+	else
+		MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
 
 	maxoff = PageGetMaxOffsetNumber(page);
 	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
@@ -774,9 +807,28 @@
 
 			/*
 			 * This is either a plain tuple or the root of a HOT-chain.
-			 * Remember it in the mapping.
+			 *
+			 * If target_offnum is specified and we found its mapping,
+			 * return.
 			 */
-			root_offsets[offnum - 1] = offnum;
+			if (OffsetNumberIsValid(target_offnum))
+			{
+				if (target_offnum == offnum)
+				{
+					root_offsets[0] = offnum;
+					return;
+				}
+				/*
+				 * No need to remember the mapping for any other item; the
+				 * root_offsets array may not even have room for it. So be
+				 * careful not to write past the array.
+				 */
+			}
+			else
+			{
+				/* Remember it in the mapping. */
+				root_offsets[offnum - 1] = offnum;
+			}
 
 			/* If it's not the start of a HOT-chain, we're done with it */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
@@ -817,15 +869,65 @@
 				!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
 				break;
 
-			/* Remember the root line pointer for this item */
-			root_offsets[nextoffnum - 1] = offnum;
+			/*
+			 * If target_offnum is specified and we found its mapping,
+			 * return.
+			 */
+			if (OffsetNumberIsValid(target_offnum))
+			{
+				if (nextoffnum == target_offnum)
+				{
+					root_offsets[0] = offnum;
+					return;
+				}
+				/*
+				 * No need to remember the mapping for any other item; the
+				 * root_offsets array may not even have room for it. So be
+				 * careful not to write past the array.
+				 */
+			}
+			else
+			{
+				/* Remember the root line pointer for this item. */
+				root_offsets[nextoffnum - 1] = offnum;
+			}
 
 			/* Advance to next chain member, if any */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
 				break;
 
+			/*
+			 * If the tuple was HOT-updated and the update was later
+			 * aborted, the tuple could have been marked as the last tuple
+			 * in the chain, with the root offset stored in its CTID, without
+			 * the HOT-updated flag being cleared. So we must check whether
+			 * the CTID actually holds a root offset, and break to avoid
+			 * infinite recursion.
+			 */
+			if (HeapTupleHeaderHasRootOffset(htup))
+				break;
+
 			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
 			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
 		}
 	}
 }
+
+/*
+ * Get the root line pointer for the given tuple.
+ */ +OffsetNumber +heap_get_root_tuple(Page page, OffsetNumber target_offnum) +{ + OffsetNumber offnum = InvalidOffsetNumber; + heap_get_root_tuples_internal(page, target_offnum, &offnum); + return offnum; +} + +/* + * Get root line pointers for all tuples in the page + */ +void +heap_get_root_tuples(Page page, OffsetNumber *root_offsets) +{ + return heap_get_root_tuples_internal(page, InvalidOffsetNumber, + root_offsets); +} diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index c7b283c..0792971 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -419,14 +419,18 @@ rewrite_heap_tuple(RewriteState state, */ if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) && - !(ItemPointerEquals(&(old_tuple->t_self), - &(old_tuple->t_data->t_ctid)))) + !(HeapTupleHeaderIsHeapLatest(old_tuple->t_data, &old_tuple->t_self))) { OldToNewMapping mapping; memset(&hashkey, 0, sizeof(hashkey)); hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data); - hashkey.tid = old_tuple->t_data->t_ctid; + + /* + * We've already checked that this is not the last tuple in the chain, + * so fetch the next TID in the chain. + */ + HeapTupleHeaderGetNextTid(old_tuple->t_data, &hashkey.tid); mapping = (OldToNewMapping) hash_search(state->rs_old_new_tid_map, &hashkey, @@ -439,7 +443,7 @@ rewrite_heap_tuple(RewriteState state, * set the ctid of this tuple to point to the new location, and * insert it right away. */ - new_tuple->t_data->t_ctid = mapping->new_tid; + HeapTupleHeaderSetNextTid(new_tuple->t_data, &mapping->new_tid); /* We don't need the mapping entry anymore */ hash_search(state->rs_old_new_tid_map, &hashkey, @@ -525,7 +529,7 @@ rewrite_heap_tuple(RewriteState state, new_tuple = unresolved->tuple; free_new = true; old_tid = unresolved->old_tid; - new_tuple->t_data->t_ctid = new_tid; + HeapTupleHeaderSetNextTid(new_tuple->t_data, &new_tid); /* * We don't need the hash entry anymore, but don't free its @@ -731,7 +735,12 @@ raw_heap_insert(RewriteState state, HeapTuple tup) newitemid = PageGetItemId(page, newoff); onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid); - onpage_tup->t_ctid = tup->t_self; + /* + * Set t_ctid just to ensure that block number is copied correctly, but + * then immediately mark the tuple as the latest. + */ + HeapTupleHeaderSetNextTid(onpage_tup, &tup->t_self); + HeapTupleHeaderSetHeapLatest(onpage_tup, newoff); } /* If heaptup is a private copy, release it. */ diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 5242dee..2142273 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -789,7 +789,8 @@ retry: DirtySnapshot.speculativeToken && TransactionIdPrecedes(GetCurrentTransactionId(), xwait)))) { - ctid_wait = tup->t_data->t_ctid; + if (!HeapTupleHeaderIsHeapLatest(tup->t_data, &tup->t_self)) + HeapTupleHeaderGetNextTid(tup->t_data, &ctid_wait); reason_wait = indexInfo->ii_ExclusionOps ? XLTW_RecheckExclusionConstr : XLTW_InsertIndex; index_endscan(index_scan); diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index f5cd65d..44a501f 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2592,7 +2592,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, * As above, it should be safe to examine xmax and t_ctid without the * buffer content lock, because they can't be changing. 
 		 */
-		if (ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid))
+		if (HeapTupleHeaderIsHeapLatest(tuple.t_data, &tuple.t_self))
 		{
 			/* deleted, so forget about it */
 			ReleaseBuffer(buffer);
@@ -2600,7 +2600,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode,
 		}
 
 		/* updated, so look at the updated row */
-		tuple.t_self = tuple.t_data->t_ctid;
+		HeapTupleHeaderGetNextTid(tuple.t_data, &tuple.t_self);
 		/* updated row should have xmin matching this xmax */
 		priorXmax = HeapTupleHeaderGetUpdateXid(tuple.t_data);
 		ReleaseBuffer(buffer);
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index a864f78..95aa976 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -189,6 +189,7 @@ extern void heap_page_prune_execute(Buffer buffer,
 						OffsetNumber *redirected, int nredirected,
 						OffsetNumber *nowdead, int ndead,
 						OffsetNumber *nowunused, int nunused);
+extern OffsetNumber heap_get_root_tuple(Page page, OffsetNumber target_offnum);
 extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 
 /* in heap/syncscan.c */
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index b285f17..e6019d5 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -193,6 +193,8 @@ typedef struct xl_heap_update
 	uint8		flags;
 	TransactionId new_xmax;		/* xmax of the new tuple */
 	OffsetNumber new_offnum;	/* new tuple's offset */
+	OffsetNumber root_offnum;	/* offset of the root line pointer in case
+								 * of a HOT or WARM update */
 
 	/*
 	 * If XLOG_HEAP_CONTAINS_OLD_TUPLE or XLOG_HEAP_CONTAINS_OLD_KEY flags are
@@ -200,7 +202,7 @@ typedef struct xl_heap_update
 	 */
 } xl_heap_update;
 
-#define SizeOfHeapUpdate (offsetof(xl_heap_update, new_offnum) + sizeof(OffsetNumber))
+#define SizeOfHeapUpdate (offsetof(xl_heap_update, root_offnum) + sizeof(OffsetNumber))
 
 /*
  * This is what we need to know about vacuum page cleanup/redirect
diff --git a/src/include/access/hio.h b/src/include/access/hio.h
index 2824f23..921cb37 100644
--- a/src/include/access/hio.h
+++ b/src/include/access/hio.h
@@ -35,8 +35,8 @@ typedef struct BulkInsertStateData
 } BulkInsertStateData;
 
 
-extern void RelationPutHeapTuple(Relation relation, Buffer buffer,
-					 HeapTuple tuple, bool token);
+extern OffsetNumber RelationPutHeapTuple(Relation relation, Buffer buffer,
+					 HeapTuple tuple, bool token, OffsetNumber root_offnum);
 
 extern Buffer RelationGetBufferForTuple(Relation relation, Size len,
 						  Buffer otherBuffer, int options,
 						  BulkInsertState bistate,
diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
index a6c7e31..7552186 100644
--- a/src/include/access/htup_details.h
+++ b/src/include/access/htup_details.h
@@ -260,13 +260,19 @@ struct HeapTupleHeaderData
  * information stored in t_infomask2:
  */
 #define HEAP_NATTS_MASK			0x07FF	/* 11 bits for number of attributes */
-/* bits 0x1800 are available */
+/* bit 0x0800 is available */
+#define HEAP_LATEST_TUPLE		0x1000	/* this is the last tuple in the
										 * chain and ip_posid points to the
										 * root line pointer */
 #define HEAP_KEYS_UPDATED		0x2000	/* tuple was updated and key cols
										 * modified, or tuple deleted */
 #define HEAP_HOT_UPDATED		0x4000	/* tuple was HOT-updated */
 #define HEAP_ONLY_TUPLE			0x8000	/* this is heap-only tuple */
 
-#define HEAP2_XACT_MASK			0xE000	/* visibility-related bits */
+#define HEAP2_XACT_MASK			0xF000	/* visibility-related bits */
+
 
 /*
  * HEAP_TUPLE_HAS_MATCH is a temporary flag used during hash joins. It is
@@ -504,6 +510,43 @@ do { \
 	((tup)->t_infomask2 & HEAP_ONLY_TUPLE) != 0 \
 )
 
+/*
+ * Mark this as the last tuple in the HOT chain. Before PG v10 we used to
+ * store the TID of the tuple itself in the t_ctid field to mark the end of
+ * the chain. Starting with PG v10, we instead use a special flag,
+ * HEAP_LATEST_TUPLE, to identify the last tuple, and store the root line
+ * pointer of the HOT chain in the t_ctid field.
+ *
+ * Note: beware of multiple evaluations of "tup" argument.
+ */
+#define HeapTupleHeaderSetHeapLatest(tup, offnum) \
+do { \
+	AssertMacro(OffsetNumberIsValid(offnum)); \
+	(tup)->t_infomask2 |= HEAP_LATEST_TUPLE; \
+	ItemPointerSetOffsetNumber(&(tup)->t_ctid, (offnum)); \
+} while (0)
+
+#define HeapTupleHeaderClearHeapLatest(tup) \
+( \
+	(tup)->t_infomask2 &= ~HEAP_LATEST_TUPLE \
+)
+
+/*
+ * Starting with PostgreSQL 10, the latest tuple in an update chain has
+ * HEAP_LATEST_TUPLE set; but tuples upgraded from earlier versions do not.
+ * For those, we determine whether a tuple is the latest by testing that its
+ * t_ctid points to itself.
+ *
+ * Note: beware of multiple evaluations of "tup" and "tid" arguments.
+ */
+#define HeapTupleHeaderIsHeapLatest(tup, tid) \
+( \
+	(((tup)->t_infomask2 & HEAP_LATEST_TUPLE) != 0) || \
+	((ItemPointerGetBlockNumber(&(tup)->t_ctid) == ItemPointerGetBlockNumber(tid)) && \
+	 (ItemPointerGetOffsetNumber(&(tup)->t_ctid) == ItemPointerGetOffsetNumber(tid))) \
+)
+
+
 #define HeapTupleHeaderSetHeapOnly(tup) \
 ( \
 	(tup)->t_infomask2 |= HEAP_ONLY_TUPLE \
@@ -542,6 +585,56 @@ do { \
 
 /*
+ * Set the t_ctid chain link, and also clear the HEAP_LATEST_TUPLE flag,
+ * since we now have a new tuple in the chain and this one is no longer the
+ * last tuple of the chain.
+ *
+ * Note: beware of multiple evaluations of "tup" argument.
+ */
+#define HeapTupleHeaderSetNextTid(tup, tid) \
+do { \
+	ItemPointerCopy((tid), &((tup)->t_ctid)); \
+	HeapTupleHeaderClearHeapLatest((tup)); \
+} while (0)
+
+/*
+ * Get the TID of the next tuple in the update chain. The caller must have
+ * checked that we are not already at the end of the chain, because in that
+ * case t_ctid may actually store the root line pointer of the HOT chain.
+ *
+ * Note: beware of multiple evaluations of "tup" argument.
+ */
+#define HeapTupleHeaderGetNextTid(tup, next_ctid) \
+do { \
+	AssertMacro(!((tup)->t_infomask2 & HEAP_LATEST_TUPLE)); \
+	ItemPointerCopy(&(tup)->t_ctid, (next_ctid)); \
+} while (0)
+
+/*
+ * Get the root line pointer of the HOT chain. The caller should have
+ * confirmed that the root offset is cached before calling this macro.
+ *
+ * Note: beware of multiple evaluations of "tup" argument.
+ */
+#define HeapTupleHeaderGetRootOffset(tup) \
+( \
+	AssertMacro(((tup)->t_infomask2 & HEAP_LATEST_TUPLE) != 0), \
+	ItemPointerGetOffsetNumber(&(tup)->t_ctid) \
+)
+
+/*
+ * Return whether the tuple has a cached root offset. We don't use
+ * HeapTupleHeaderIsHeapLatest, because that one also considers the case of
+ * t_ctid pointing to itself, for tuples migrated from pre-v10 clusters.
+ * Here we are interested only in tuples that are marked with the
+ * HEAP_LATEST_TUPLE flag.
+ */
+#define HeapTupleHeaderHasRootOffset(tup) \
+( \
+	((tup)->t_infomask2 & HEAP_LATEST_TUPLE) != 0 \
+)
+
+/*
  * BITMAPLEN(NATTS) -
  *		Computes size of null bitmap given number of data columns.
  */
-- 
2.1.4
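
Taken together, these macros replace the old convention of a self-referencing t_ctid at the end of an update chain. A short illustrative sketch of how code is now expected to walk a chain, mirroring the checks used in heap_get_latest_tid and EvalPlanQualFetch above (the helper name chain_step is hypothetical, not part of the patch):

#include "access/htup_details.h"

/*
 * Follow one step of an update chain. Returns true and stores the next
 * version's TID in *next if the tuple has a successor; returns false if
 * this tuple is the latest version.
 */
static bool
chain_step(HeapTupleHeader htup, ItemPointer self, ItemPointer next)
{
	if (HeapTupleHeaderIsHeapLatest(htup, self))
	{
		/*
		 * End of the chain: t_ctid is not a forward link. If the tuple
		 * carries HEAP_LATEST_TUPLE, its t_ctid offset is instead the root
		 * line pointer of the HOT chain.
		 */
		if (HeapTupleHeaderHasRootOffset(htup))
			(void) HeapTupleHeaderGetRootOffset(htup);
		return false;
	}

	/* Not the last version: t_ctid is a real forward link. */
	HeapTupleHeaderGetNextTid(htup, next);
	return true;
}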