From ecd061fdcaab721454046685f9209943ab7910f7 Mon Sep 17 00:00:00 2001 From: Antonin Houska Date: Wed, 11 Dec 2024 19:22:42 +0100 Subject: [PATCH 4/8] Add CONCURRENTLY option to both VACUUM FULL and CLUSTER commands. Both VACUUM FULL and CLUSTER commands copy the relation data into a new file, create new indexes and eventually swap the files. To make sure that the old file does not change during the copying, the relation is locked in an exclusive mode, which prevents applications from both reading and writing. (To keep the data consistent, we'd only need to prevent the applications from writing, but even reading needs to be blocked before we can swap the files - otherwise some applications could continue using the old file. Since we cannot get stronger lock without releasing the weaker one first, we acquire the exclusive lock in the beginning and keep it till the end of the processing.) This patch introduces an alternative workflow, which only requires the exclusive lock when the relation (and index) files are being swapped. (Supposedly, the swapping should be pretty fast.) On the other hand, when we copy the data to the new file, we allow applications to read from the relation and even write into it. First, we scan the relation using a "historic snapshot", and insert all the tuples satisfying this snapshot into the new file. Note that, before creating that snapshot, we need to make sure that all the other backends treat the relation as a system catalog: in particular, they must log information on new command IDs (CIDs). We achieve that by adding the relation ID into a shared hash table and waiting until all the transactions currently writing into the table (i.e. transactions possibly not aware of the new entry) have finished. Second, logical decoding is used to capture the data changes done by applications during the copying (i.e. 
changes that do not satisfy the historic snapshot mentioned above), and those are applied to the new file before we acquire the exclusive lock we need to swap the files. (Of course, more data changes can take place while we are waiting for the lock - these will be applied to the new file after we have acquired the lock, before we swap the files.) While copying the data into the new file, we hold a lock that prevents applications from changing the relation tuple descriptor (tuples inserted into the old file must fit into the new file). However, as we have to release that lock before getting the exclusive one, it's possible that someone adds or drops a column, or changes the data type of an existing one. Therefore we have to check the tuple descriptor before we swap the files. If we find out that the tuple descriptor changed, ERROR is raised and all the changes are rolled back. Since a lot of effort can be wasted in such a case, the ALTER TABLE command also tries to check if VACUUM FULL / CLUSTER with the CONCURRENTLY option is running on the same relation, and raises an ERROR if it is. Like the existing implementation of both VACUUM FULL and CLUSTER commands, the variant with the CONCURRENTLY option also requires extra space for the new relation and index files (which coexist with the old files for some time). In addition, the CONCURRENTLY option might introduce a lag in releasing WAL segments for archiving / recycling. This is due to the decoding of the data changes done by applications concurrently. However, this lag should not be more than a single WAL segment. 
--- doc/src/sgml/monitoring.sgml | 36 +- doc/src/sgml/ref/cluster.sgml | 116 +- doc/src/sgml/ref/vacuum.sgml | 22 +- src/Makefile | 1 + src/backend/access/heap/heapam.c | 8 +- src/backend/access/heap/heapam_handler.c | 145 +- src/backend/access/heap/heapam_visibility.c | 30 +- src/backend/catalog/index.c | 43 +- src/backend/catalog/system_views.sql | 17 +- src/backend/commands/cluster.c | 2572 ++++++++++++++++- src/backend/commands/matview.c | 2 +- src/backend/commands/tablecmds.c | 11 + src/backend/commands/vacuum.c | 126 +- src/backend/meson.build | 1 + src/backend/replication/logical/decode.c | 24 + src/backend/replication/logical/snapbuild.c | 20 + .../replication/pgoutput_cluster/Makefile | 32 + .../replication/pgoutput_cluster/meson.build | 18 + .../pgoutput_cluster/pgoutput_cluster.c | 288 ++ src/backend/storage/ipc/ipci.c | 3 + src/backend/tcop/utility.c | 11 + src/backend/utils/activity/backend_progress.c | 16 + .../utils/activity/wait_event_names.txt | 1 + src/backend/utils/cache/inval.c | 22 + src/backend/utils/cache/relcache.c | 5 + src/backend/utils/time/snapmgr.c | 3 +- src/bin/psql/tab-complete.in.c | 5 +- src/include/access/heapam.h | 4 + src/include/access/tableam.h | 10 + src/include/catalog/index.h | 3 + src/include/commands/cluster.h | 94 +- src/include/commands/progress.h | 17 +- src/include/commands/vacuum.h | 17 +- src/include/replication/snapbuild.h | 1 + src/include/storage/lockdefs.h | 5 +- src/include/storage/lwlocklist.h | 1 + src/include/utils/backend_progress.h | 3 +- src/include/utils/inval.h | 2 + src/include/utils/rel.h | 7 +- src/include/utils/snapmgr.h | 2 + src/test/regress/expected/rules.out | 17 +- 41 files changed, 3563 insertions(+), 198 deletions(-) create mode 100644 src/backend/replication/pgoutput_cluster/Makefile create mode 100644 src/backend/replication/pgoutput_cluster/meson.build create mode 100644 src/backend/replication/pgoutput_cluster/pgoutput_cluster.c diff --git a/doc/src/sgml/monitoring.sgml 
b/doc/src/sgml/monitoring.sgml index 840d7f8161..6abf639b3e 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -5688,14 +5688,35 @@ FROM pg_stat_get_backend_idset() AS backendid; - heap_tuples_written bigint + heap_tuples_inserted bigint - Number of heap tuples written. + Number of heap tuples inserted. This counter only advances when the phase is seq scanning heap, - index scanning heap - or writing new heap. + index scanning heap, + writing new heap + or catch-up. + + + + + + heap_tuples_updated bigint + + + Number of heap tuples updated. + This counter only advances when the phase is catch-up. + + + + + + heap_tuples_deleted bigint + + + Number of heap tuples deleted. + This counter only advances when the phase is catch-up. @@ -5776,6 +5797,13 @@ FROM pg_stat_get_backend_idset() AS backendid; CLUSTER is currently writing the new heap. + + catch-up + + CLUSTER is currently processing the DML commands + that other transactions executed during any of the preceding phases. + + swapping relation files diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml index c5760244e6..526f0c5843 100644 --- a/doc/src/sgml/ref/cluster.sgml +++ b/doc/src/sgml/ref/cluster.sgml @@ -26,6 +26,7 @@ CLUSTER [ ( option [, ...] ) ] [ where option can be one of: VERBOSE [ boolean ] + CONCURRENTLY [ boolean ] @@ -69,14 +70,17 @@ CLUSTER [ ( option [, ...] ) ] [ table_name reclusters all the previously-clustered tables in the current database that the calling user has privileges for. This form of CLUSTER cannot be - executed inside a transaction block. + executed inside a transaction block. Also, this form is not allowed if + the CONCURRENTLY option is used. - When a table is being clustered, an ACCESS - EXCLUSIVE lock is acquired on it. This prevents any other - database operations (both reads and writes) from operating on the - table until the CLUSTER is finished. + When a table is being clustered, an ACCESS EXCLUSIVE + lock is acquired on it. 
This prevents any other database operations (both + reads and writes) from operating on the table until + the CLUSTER is finished. If you want to keep the table + accessible during the clustering, consider using + the CONCURRENTLY option. @@ -111,6 +115,108 @@ CLUSTER [ ( option [, ...] ) ] [ + + CONCURRENTLY + + + Allow other transactions to use the table while it is being clustered. + + + + Internally, CLUSTER copies the contents of the table + (ignoring dead tuples) into a new file, sorted by the specified index, + and also creates a new file for each index. Then it swaps the old and + new files for the table and all the indexes, and deletes the old + files. The ACCESS EXCLUSIVE lock is needed to make + sure that the old files do not change during the processing because the + changes would get lost due to the swap. + + + + With the CONCURRENTLY option, the ACCESS + EXCLUSIVE lock is only acquired to swap the table and index + files. The data changes that took place during the creation of the new + table and index files are captured using logical decoding + () and applied before + the ACCESS EXCLUSIVE lock is requested. Thus the lock + is typically held only for the time needed to swap the files, which + should be pretty short. + + + + Note that CLUSTER with the + CONCURRENTLY option does not try to order the + rows inserted into the table after the clustering started. Also + note that CLUSTER might fail to complete due to DDL + commands executed on the table by other transactions during the + clustering. + + + + + In addition to the temporary space requirements explained below, + the CONCURRENTLY option can slightly increase the usage of + temporary space. The reason is that other transactions can + perform DML operations which cannot be applied to the new file until + CLUSTER has copied all the tuples from the old + file. 
Thus the tuples inserted into the old file during the copying are + also stored separately in a temporary file, so they can eventually + be applied to the new file. + + + + Furthermore, the data changes performed during the copying are + extracted from the write-ahead log (WAL), and + this extraction (decoding) only takes place when a certain amount of WAL + has been written. Therefore, WAL removal can be delayed by this + threshold. Currently the threshold is equal to the value of + the wal_segment_size + configuration parameter. + + + + + The CONCURRENTLY option cannot be used in the + following cases: + + + + + The table is partitioned. + + + + + + The table is a system catalog or a TOAST table. + + + + + + CLUSTER is executed inside a transaction block. + + + + + + The wal_level + configuration parameter is less than logical. + + + + + + The max_replication_slots + configuration parameter does not allow for creation of an additional + replication slot. + + + + + + + boolean diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml index 9110938fab..da2bcd85c0 100644 --- a/doc/src/sgml/ref/vacuum.sgml +++ b/doc/src/sgml/ref/vacuum.sgml @@ -39,6 +39,7 @@ VACUUM [ ( option [, ...] ) ] [ boolean ] ONLY_DATABASE_STATS [ boolean ] BUFFER_USAGE_LIMIT size + CONCURRENTLY [ boolean ] and table_and_columns is: @@ -62,7 +63,8 @@ VACUUM [ ( option [, ...] ) ] [ table_and_columns list, VACUUM processes every table and materialized view in the current database that the current user has permission to vacuum. - With a list, VACUUM processes only those table(s). + With a list, VACUUM processes only those table(s). The + list is required if the CONCURRENTLY option is used. @@ -360,6 +362,24 @@ VACUUM [ ( option [, ...] ) ] [ + + CONCURRENTLY + + + Allow other transactions to use the table while it is being vacuumed. If + this option is specified, VACUUM can only process + tables which have already been clustered. 
For more information, see the + description of the CONCURRENTLY option of the + command. + + + + The CONCURRENTLY option can only be used + if FULL is used at the same time. + + + + boolean diff --git a/src/Makefile b/src/Makefile index 2f31a2f20a..8b9d30ff72 100644 --- a/src/Makefile +++ b/src/Makefile @@ -23,6 +23,7 @@ SUBDIRS = \ interfaces \ backend/replication/libpqwalreceiver \ backend/replication/pgoutput \ + backend/replication/pgoutput_cluster \ fe_utils \ bin \ pl \ diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index d00300c5dc..a842b84415 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2070,8 +2070,14 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, /* * If this is a catalog, we need to transmit combo CIDs to properly * decode, so log that as well. + * + * For the main heap (as opposed to TOAST), we only receive + * HEAP_INSERT_NO_LOGICAL when doing VACUUM FULL / CLUSTER, in which + * case the visibility information does not change. Therefore, there's + * no need to update the decoding snapshot. 
*/ - if (RelationIsAccessibleInLogicalDecoding(relation)) + if ((options & HEAP_INSERT_NO_LOGICAL) == 0 && + RelationIsAccessibleInLogicalDecoding(relation)) log_heap_new_cid(relation, heaptup); /* diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index a8d95e0f1c..06cd85b34b 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -33,6 +33,7 @@ #include "catalog/index.h" #include "catalog/storage.h" #include "catalog/storage_xlog.h" +#include "commands/cluster.h" #include "commands/progress.h" #include "executor/executor.h" #include "miscadmin.h" @@ -53,6 +54,9 @@ static void reform_and_rewrite_tuple(HeapTuple tuple, static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, HeapTuple tuple, OffsetNumber tupoffset); +static HeapTuple accept_tuple_for_concurrent_copy(HeapTuple tuple, + Snapshot snapshot, + Buffer buffer); static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); @@ -681,6 +685,8 @@ static void heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, Relation OldIndex, bool use_sort, TransactionId OldestXmin, + Snapshot snapshot, + LogicalDecodingContext *decoding_ctx, TransactionId *xid_cutoff, MultiXactId *multi_cutoff, double *num_tuples, @@ -701,6 +707,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, bool *isnull; BufferHeapTupleTableSlot *hslot; BlockNumber prev_cblock = InvalidBlockNumber; + bool concurrent = snapshot != NULL; + XLogRecPtr end_of_wal_prev = GetFlushRecPtr(NULL); /* Remember if it's a system catalog */ is_system_catalog = IsSystemRelation(OldHeap); @@ -779,8 +787,10 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, for (;;) { HeapTuple tuple; + bool tuple_copied = false; Buffer buf; bool isdead; + HTSV_Result vis; CHECK_FOR_INTERRUPTS(); @@ -835,7 +845,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, LockBuffer(buf, 
BUFFER_LOCK_SHARE); - switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf)) + switch ((vis = HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))) { case HEAPTUPLE_DEAD: /* Definitely dead */ @@ -851,14 +861,15 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, case HEAPTUPLE_INSERT_IN_PROGRESS: /* - * Since we hold exclusive lock on the relation, normally the - * only way to see this is if it was inserted earlier in our - * own transaction. However, it can happen in system + * As long as we hold exclusive lock on the relation, normally + * the only way to see this is if it was inserted earlier in + * our own transaction. However, it can happen in system * catalogs, since we tend to release write lock before commit - * there. Give a warning if neither case applies; but in any - * case we had better copy it. + * there. Also, there's no exclusive lock during concurrent + * processing. Give a warning if neither case applies; but in + * any case we had better copy it. */ - if (!is_system_catalog && + if (!is_system_catalog && !concurrent && !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) elog(WARNING, "concurrent insert in progress within table \"%s\"", RelationGetRelationName(OldHeap)); @@ -870,7 +881,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, /* * Similar situation to INSERT_IN_PROGRESS case. 
*/ - if (!is_system_catalog && + if (!is_system_catalog && !concurrent && !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) elog(WARNING, "concurrent delete in progress within table \"%s\"", RelationGetRelationName(OldHeap)); @@ -884,8 +895,6 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, break; } - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - if (isdead) { *tups_vacuumed += 1; @@ -896,9 +905,47 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, *tups_vacuumed += 1; *tups_recently_dead -= 1; } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); continue; } + if (concurrent) + { + /* + * Ignore concurrent changes now, they'll be processed later via + * logical decoding. + * + * INSERT_IN_PROGRESS is rejected right away because our snapshot + * should represent a point in time which should precede (or be + * equal to) the state of transactions as it was when the + * "SatisfiesVacuum" test was performed. Thus + * accept_tuple_for_concurrent_copy() should not consider the + * tuple inserted. + */ + if (vis == HEAPTUPLE_INSERT_IN_PROGRESS) + tuple = NULL; + else + tuple = accept_tuple_for_concurrent_copy(tuple, snapshot, + buf); + /* Tuple not suitable for the new heap? */ + if (tuple == NULL) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + continue; + } + + /* Remember that we have to free the tuple eventually. */ + tuple_copied = true; + } + + /* + * In the concurrent case, we have a copy of the tuple, so we don't + * worry whether the source tuple will be deleted / updated after we + * release the lock. 
+ */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + *num_tuples += 1; if (tuplesort != NULL) { @@ -915,7 +962,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, { const int ct_index[] = { PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, - PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN + PROGRESS_CLUSTER_HEAP_TUPLES_INSERTED }; int64 ct_val[2]; @@ -930,6 +977,33 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, ct_val[1] = *num_tuples; pgstat_progress_update_multi_param(2, ct_index, ct_val); } + if (tuple_copied) + heap_freetuple(tuple); + + /* + * Process the WAL produced by the load, as well as by other + * transactions, so that the replication slot can advance and WAL does + * not pile up. Use wal_segment_size as a threshold so that we do not + * introduce the decoding overhead too often. + * + * Of course, we must not apply the changes until the initial load has + * completed. + * + * Note that our insertions into the new table should not be decoded + * as we (intentionally) do not write the logical decoding specific + * information to WAL. + */ + if (concurrent) + { + XLogRecPtr end_of_wal; + + end_of_wal = GetFlushRecPtr(NULL); + if ((end_of_wal - end_of_wal_prev) > wal_segment_size) + { + cluster_decode_concurrent_changes(decoding_ctx, end_of_wal); + end_of_wal_prev = end_of_wal; + } + } } if (indexScan != NULL) @@ -973,7 +1047,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, values, isnull, rwstate); /* Report n_tuples */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_INSERTED, n_tuples); } @@ -2612,6 +2686,53 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, } } +/* + * Return copy of 'tuple' if it has been inserted according to 'snapshot', or + * NULL if the insertion took place in the future. 
If the tuple is already + * marked as deleted or updated by a transaction that 'snapshot' still + * considers running, clear the deletion / update XID in the header of the + * copied tuple. This way the returned tuple is suitable for insertion into + * the new heap. + */ +static HeapTuple +accept_tuple_for_concurrent_copy(HeapTuple tuple, Snapshot snapshot, + Buffer buffer) +{ + HeapTuple result; + + Assert(snapshot->snapshot_type == SNAPSHOT_MVCC); + + /* + * First, check if the tuple insertion is visible by our snapshot. + */ + if (!HeapTupleMVCCInserted(tuple, snapshot, buffer)) + return NULL; + + result = heap_copytuple(tuple); + + /* + * If the tuple was deleted / updated but our snapshot still sees it, we + * need to keep it. In that case, clear the information that indicates the + * deletion / update. Otherwise the tuple chain would stay incomplete (as + * we will reject the new tuple above), and the delete / update would fail + * if executed later during logical decoding. + */ + if (TransactionIdIsNormal(HeapTupleHeaderGetRawXmax(result->t_data)) && + HeapTupleMVCCNotDeleted(result, snapshot, buffer)) + { + /* TODO More work needed here?*/ + result->t_data->t_infomask |= HEAP_XMAX_INVALID; + HeapTupleHeaderSetXmax(result->t_data, 0); + } + + /* + * Accept the tuple even if our snapshot considers it deleted - older + * snapshots can still see the tuple, while the decoded transactions + * should not try to update / delete it again. + */ + return result; +} + /* ------------------------------------------------------------------------ * Definition of the heap table access method. 
diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 9243feed01..d702592469 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -955,16 +955,31 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, * did TransactionIdIsInProgress in each call --- to no avail, as long as the * inserting/deleting transaction was still running --- which was more cycles * and more contention on ProcArrayLock. + * + * The checks are split into two functions, HeapTupleMVCCInserted() and + * HeapTupleMVCCNotDeleted(), because they are also useful separately. */ static bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, Buffer buffer) { - HeapTupleHeader tuple = htup->t_data; - Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); + return HeapTupleMVCCInserted(htup, snapshot, buffer) && + HeapTupleMVCCNotDeleted(htup, snapshot, buffer); +} + +/* + * HeapTupleMVCCInserted + * True iff heap tuple was successfully inserted for the given MVCC + * snapshot. + */ +bool +HeapTupleMVCCInserted(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + if (!HeapTupleHeaderXminCommitted(tuple)) { if (HeapTupleHeaderXminInvalid(tuple)) @@ -1073,6 +1088,17 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, } /* by here, the inserting transaction has committed */ + return true; +} + +/* + * HeapTupleMVCCNotDeleted + * True iff heap tuple was not deleted for the given MVCC snapshot. 
+ */ +bool +HeapTupleMVCCNotDeleted(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ return true; diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 1c3a9e06d3..a1940c1fb9 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1416,22 +1416,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, for (int i = 0; i < newInfo->ii_NumIndexAttrs; i++) opclassOptions[i] = get_attoptions(oldIndexId, i + 1); - /* Extract statistic targets for each attribute */ - stattargets = palloc0_array(NullableDatum, newInfo->ii_NumIndexAttrs); - for (int i = 0; i < newInfo->ii_NumIndexAttrs; i++) - { - HeapTuple tp; - Datum dat; - - tp = SearchSysCache2(ATTNUM, ObjectIdGetDatum(oldIndexId), Int16GetDatum(i + 1)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for attribute %d of relation %u", - i + 1, oldIndexId); - dat = SysCacheGetAttr(ATTNUM, tp, Anum_pg_attribute_attstattarget, &isnull); - ReleaseSysCache(tp); - stattargets[i].value = dat; - stattargets[i].isnull = isnull; - } + stattargets = get_index_stattargets(oldIndexId, newInfo); /* * Now create the new index. 
@@ -1470,6 +1455,32 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, return newIndexId; } +NullableDatum * +get_index_stattargets(Oid indexid, IndexInfo *indInfo) +{ + NullableDatum *stattargets; + + /* Extract statistic targets for each attribute */ + stattargets = palloc0_array(NullableDatum, indInfo->ii_NumIndexAttrs); + for (int i = 0; i < indInfo->ii_NumIndexAttrs; i++) + { + HeapTuple tp; + Datum dat; + bool isnull; + + tp = SearchSysCache2(ATTNUM, ObjectIdGetDatum(indexid), Int16GetDatum(i + 1)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for attribute %d of relation %u", + i + 1, indexid); + dat = SysCacheGetAttr(ATTNUM, tp, Anum_pg_attribute_attstattarget, &isnull); + ReleaseSysCache(tp); + stattargets[i].value = dat; + stattargets[i].isnull = isnull; + } + + return stattargets; +} + /* * index_concurrently_build * diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index da9a8fe99f..3652b8a9c5 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1240,16 +1240,19 @@ CREATE VIEW pg_stat_progress_cluster AS WHEN 2 THEN 'index scanning heap' WHEN 3 THEN 'sorting tuples' WHEN 4 THEN 'writing new heap' - WHEN 5 THEN 'swapping relation files' - WHEN 6 THEN 'rebuilding index' - WHEN 7 THEN 'performing final cleanup' + WHEN 5 THEN 'catch-up' + WHEN 6 THEN 'swapping relation files' + WHEN 7 THEN 'rebuilding index' + WHEN 8 THEN 'performing final cleanup' END AS phase, CAST(S.param3 AS oid) AS cluster_index_relid, S.param4 AS heap_tuples_scanned, - S.param5 AS heap_tuples_written, - S.param6 AS heap_blks_total, - S.param7 AS heap_blks_scanned, - S.param8 AS index_rebuild_count + S.param5 AS heap_tuples_inserted, + S.param6 AS heap_tuples_updated, + S.param7 AS heap_tuples_deleted, + S.param8 AS heap_blks_total, + S.param9 AS heap_blks_scanned, + S.param10 AS index_rebuild_count FROM pg_stat_get_progress_info('CLUSTER') AS S LEFT JOIN 
pg_database D ON S.datid = D.oid; diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 7ec605b0bd..4a4b51f77d 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -25,6 +25,10 @@ #include "access/toast_internals.h" #include "access/transam.h" #include "access/xact.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/heap.h" @@ -32,6 +36,7 @@ #include "catalog/namespace.h" #include "catalog/objectaccess.h" #include "catalog/pg_am.h" +#include "catalog/pg_control.h" #include "catalog/pg_inherits.h" #include "catalog/toasting.h" #include "commands/cluster.h" @@ -39,10 +44,15 @@ #include "commands/progress.h" #include "commands/tablecmds.h" #include "commands/vacuum.h" +#include "executor/executor.h" #include "miscadmin.h" #include "optimizer/optimizer.h" #include "pgstat.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/snapbuild.h" #include "storage/bufmgr.h" +#include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/predicate.h" #include "utils/acl.h" @@ -56,6 +66,8 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" +typedef struct RewriteStateData *RewriteState; + /* * This struct is used to pass around the information on tables to be * clustered. We need this so we can make a list of them when invoked without @@ -67,17 +79,183 @@ typedef struct Oid indexOid; } RelToCluster; +/* + * The following definitions are used for concurrent processing. + */ + +/* + * OID of the table being processed by this backend. + */ +static Oid clustered_rel = InvalidOid; +/* The same for its TOAST relation. */ +static Oid clustered_rel_toast = InvalidOid; + +/* + * The locators are used to avoid logical decoding of data that we do not need + * for our table. 
+ */ +RelFileLocator clustered_rel_locator = {.relNumber = InvalidOid}; +RelFileLocator clustered_rel_toast_locator = {.relNumber = InvalidOid}; + +/* XXX Do we also need to mention VACUUM FULL CONCURRENTLY? */ +#define CLUSTER_IN_PROGRESS_MESSAGE \ + "relation \"%s\" is already being processed by CLUSTER CONCURRENTLY" + +/* + * Everything we need to call ExecInsertIndexTuples(). + */ +typedef struct IndexInsertState +{ + ResultRelInfo *rri; + EState *estate; + ExprContext *econtext; + + Relation ident_index; +} IndexInsertState; -static void cluster_multiple_rels(List *rtcs, ClusterParams *params); -static void rebuild_relation(Relation OldHeap, Relation index, bool verbose); +/* + * Catalog information to check if another backend changed the relation in + * such a way that makes CLUSTER CONCURRENTLY unable to continue. Such changes + * are possible because cluster_rel() has to release its lock on the relation + * in order to acquire AccessExclusiveLock that it needs to swap the relation + * files. + * + * The most obvious problem is that the tuple descriptor has changed, since + * then the tuples we try to insert into the new storage are not guaranteed to + * fit into the storage. + * + * Another problem is a relfilenode changed by another backend. It's not + * necessarily a correctness issue (e.g. when the other backend ran + * cluster_rel()), but it's safer for us to terminate the table processing in + * such cases. However, this information also needs to be checked during + * logical decoding, so we store it in global variables clustered_rel_locator + * and clustered_rel_toast_locator above. + * + * Where possible, commands which might change the relation in an incompatible + * way should check if CLUSTER CONCURRENTLY is running, before they start to + * do the actual changes (see is_concurrent_cluster_in_progress()). Anything + * else must be caught by check_catalog_changes(), which uses this structure. 
+ */ +typedef struct CatalogState +{ + /* Tuple descriptor of the relation. */ + TupleDesc tupdesc; + + /* The number of indexes tracked. */ + int ninds; + /* The index OIDs. */ + Oid *ind_oids; + /* The index tuple descriptors. */ + TupleDesc *ind_tupdescs; + + /* The following are copies of the corresponding fields of pg_class. */ + char relpersistence; + char replident; + + /* rd_replidindex */ + Oid replidindex; +} CatalogState; + +/* The WAL segment being decoded. */ +static XLogSegNo cluster_current_segment = 0; + +static void cluster_multiple_rels(List *rtcs, ClusterParams *params, + LOCKMODE lockmode, bool isTopLevel); +static void rebuild_relation(Relation OldHeap, Relation index, bool verbose, + bool concurrent, bool is_vacuum); static void copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, + Snapshot snapshot, LogicalDecodingContext *decoding_ctx, bool verbose, bool *pSwapToastByContent, TransactionId *pFreezeXid, MultiXactId *pCutoffMulti); static List *get_tables_to_cluster(MemoryContext cluster_context); static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid); static bool cluster_is_permitted_for_relation(Oid relid, Oid userid); +static void check_concurrent_cluster_requirements(Relation rel, + bool isTopLevel, + bool isCluster); +static void begin_concurrent_cluster(Relation *rel_p, Relation *index_p, + bool *entered_p); +static void end_concurrent_cluster(bool error); +static void cluster_before_shmem_exit_callback(int code, Datum arg); +static CatalogState *get_catalog_state(Relation rel, bool is_vacuum); +static void free_catalog_state(CatalogState *state); +static void check_catalog_changes(Relation rel, CatalogState *cat_state); +static LogicalDecodingContext *setup_logical_decoding(Oid relid, + const char *slotname, + TupleDesc tupdesc); +static HeapTuple get_changed_tuple(char *change); +static void apply_concurrent_changes(ClusterDecodingState *dstate, + Relation rel, ScanKey key, int 
nkeys, + IndexInsertState *iistate); +static void apply_concurrent_insert(Relation rel, ConcurrentChange *change, + HeapTuple tup, IndexInsertState *iistate, + TupleTableSlot *index_slot); +static void apply_concurrent_update(Relation rel, HeapTuple tup, + HeapTuple tup_target, + ConcurrentChange *change, + IndexInsertState *iistate, + TupleTableSlot *index_slot); +static void apply_concurrent_delete(Relation rel, HeapTuple tup_target, + ConcurrentChange *change); +static HeapTuple find_target_tuple(Relation rel, ScanKey key, int nkeys, + HeapTuple tup_key, + IndexInsertState *iistate, + TupleTableSlot *ident_slot, + IndexScanDesc *scan_p); +static void process_concurrent_changes(LogicalDecodingContext *ctx, + XLogRecPtr end_of_wal, + Relation rel_dst, + Relation rel_src, + ScanKey ident_key, + int ident_key_nentries, + IndexInsertState *iistate); +static IndexInsertState *get_index_insert_state(Relation relation, + Oid ident_index_id); +static ScanKey build_identity_key(Oid ident_idx_oid, Relation rel_src, + int *nentries); +static void free_index_insert_state(IndexInsertState *iistate); +static void cleanup_logical_decoding(LogicalDecodingContext *ctx); +static void rebuild_relation_finish_concurrent(Relation NewHeap, Relation OldHeap, + Relation cl_index, + CatalogState *cat_state, + LogicalDecodingContext *ctx, + bool swap_toast_by_content, + TransactionId frozenXid, + MultiXactId cutoffMulti); +static List *build_new_indexes(Relation NewHeap, Relation OldHeap, List *OldIndexes); + +/* + * Use this API when relation needs to be unlocked, closed and re-opened. If + * the relation got dropped while being unlocked, raise ERROR that mentions + * the relation name rather than OID. + */ +typedef struct RelReopenInfo +{ + /* + * The relation to be closed. Pointer to the value is stored here so that + * the user gets his reference updated automatically on re-opening. 
+ * + * When calling unlock_and_close_relations(), 'relid' can be passed + * instead of 'rel_p' when the caller only needs to gather information for + * subsequent opening. + */ + Relation *rel_p; + Oid relid; + char relkind; + LOCKMODE lockmode_orig; /* The existing lock mode */ + LOCKMODE lockmode_new; /* The lock mode after the relation is + * re-opened */ + + char *relname; /* Relation name, initialized automatically. */ +} RelReopenInfo; + +static void init_rel_reopen_info(RelReopenInfo *rri, Relation *rel_p, + Oid relid, LOCKMODE lockmode_orig, + LOCKMODE lockmode_new); +static void unlock_and_close_relations(RelReopenInfo *rels, int nrel); +static void reopen_relations(RelReopenInfo *rels, int nrel); /*--------------------------------------------------------------------------- * This cluster code allows for clustering multiple tables at once. Because @@ -109,10 +287,12 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) ListCell *lc; ClusterParams params = {0}; bool verbose = false; + bool concurrent = false; Relation rel = NULL; Oid indexOid = InvalidOid; MemoryContext cluster_context; List *rtcs; + LOCKMODE lockmode; /* Parse option list */ foreach(lc, stmt->params) @@ -121,6 +301,8 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) if (strcmp(opt->defname, "verbose") == 0) verbose = defGetBoolean(opt); + else if (strcmp(opt->defname, "concurrently") == 0) + concurrent = defGetBoolean(opt); else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -129,20 +311,30 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) parser_errposition(pstate, opt->location))); } - params.options = (verbose ? CLUOPT_VERBOSE : 0); + params.options = + (verbose ? CLUOPT_VERBOSE : 0) | + (concurrent ? CLUOPT_CONCURRENT : 0); + + /* + * Determine the lock mode expected by cluster_rel(). + * + * In the exclusive case, we obtain AccessExclusiveLock right away to + * avoid lock-upgrade hazard in the single-transaction case. 
In the + * CONCURRENT case, the AccessExclusiveLock will only be used at the end + * of processing, supposedly for very short time. Until then, we'll have + * to unlock the relation temporarily, so there's no lock-upgrade hazard. + */ + lockmode = (params.options & CLUOPT_CONCURRENT) == 0 ? + AccessExclusiveLock : ShareUpdateExclusiveLock; if (stmt->relation != NULL) { /* This is the single-relation case. */ Oid tableOid; - /* - * Find, lock, and check permissions on the table. We obtain - * AccessExclusiveLock right away to avoid lock-upgrade hazard in the - * single-transaction case. - */ + /* Find, lock, and check permissions on the table. */ tableOid = RangeVarGetRelidExtended(stmt->relation, - AccessExclusiveLock, + lockmode, 0, RangeVarCallbackMaintainsTable, NULL); @@ -194,7 +386,7 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) /* For non-partitioned tables, do what we came here to do. */ if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) { - cluster_rel(rel, indexOid, ¶ms); + cluster_rel(rel, indexOid, ¶ms, isTopLevel, false); /* cluster_rel closes the relation, but keeps lock */ return; @@ -202,10 +394,29 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) } /* - * By here, we know we are in a multi-table situation. In order to avoid - * holding locks for too long, we want to process each table in its own - * transaction. This forces us to disallow running inside a user - * transaction block. + * By here, we know we are in a multi-table situation. + * + * Concurrent processing is currently considered rather special (e.g. in + * terms of resources consumed) so it is not performed in bulk. 
+ */ + if (params.options & CLUOPT_CONCURRENT) + { + if (rel != NULL) + { + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + ereport(ERROR, + (errmsg("CLUSTER (CONCURRENTLY) not supported for partitioned tables"), + errhint("Consider running the command for individual partitions."))); + } + else + ereport(ERROR, + (errmsg("CLUSTER (CONCURRENTLY) requires explicit table name"))); + } + + /* + * In order to avoid holding locks for too long, we want to process each + * table in its own transaction. This forces us to disallow running + * inside a user transaction block. */ PreventInTransactionBlock(isTopLevel, "CLUSTER"); @@ -230,11 +441,14 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) if (rel != NULL) { Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + /* See the ereport() above. */ + Assert((params.options & CLUOPT_CONCURRENT) == 0); + check_index_is_clusterable(rel, indexOid, AccessShareLock); rtcs = get_tables_to_cluster_partitioned(cluster_context, indexOid); /* close relation, releasing lock on parent table */ - table_close(rel, AccessExclusiveLock); + table_close(rel, lockmode); } else { @@ -243,7 +457,7 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) } /* Do the job. */ - cluster_multiple_rels(rtcs, ¶ms); + cluster_multiple_rels(rtcs, ¶ms, lockmode, isTopLevel); /* Start a new transaction for the cleanup work. */ StartTransactionCommand(); @@ -260,7 +474,8 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) * return. 
*/ static void -cluster_multiple_rels(List *rtcs, ClusterParams *params) +cluster_multiple_rels(List *rtcs, ClusterParams *params, LOCKMODE lockmode, + bool isTopLevel) { ListCell *lc; @@ -280,10 +495,10 @@ cluster_multiple_rels(List *rtcs, ClusterParams *params) /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetTransactionSnapshot()); - rel = table_open(rtc->tableOid, AccessExclusiveLock); + rel = table_open(rtc->tableOid, lockmode); /* Process this table */ - cluster_rel(rel, rtc->indexOid, params); + cluster_rel(rel, rtc->indexOid, params, isTopLevel, false); /* cluster_rel closes the relation, but keeps lock */ PopActiveSnapshot(); @@ -306,9 +521,16 @@ cluster_multiple_rels(List *rtcs, ClusterParams *params) * If indexOid is InvalidOid, the table will be rewritten in physical order * instead of index order. This is the new implementation of VACUUM FULL, * and error messages should refer to the operation as VACUUM not CLUSTER. + * + * Note that, in the concurrent case, the function releases the lock at some + * point, in order to get AccessExclusiveLock for the final steps (i.e. to + * swap the relation files). To make things simpler, the caller should expect + * OldHeap to be closed on return, regardless CLUOPT_CONCURRENT. (The + * AccessExclusiveLock is kept till the end of the transaction.) */ void -cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) +cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params, + bool isTopLevel, bool isVacuum) { Oid tableOid = RelationGetRelid(OldHeap); Oid save_userid; @@ -317,8 +539,46 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) bool verbose = ((params->options & CLUOPT_VERBOSE) != 0); bool recheck = ((params->options & CLUOPT_RECHECK) != 0); Relation index = NULL; + bool concurrent = ((params->options & CLUOPT_CONCURRENT) != 0); + LOCKMODE lmode; + bool entered, success; + + /* + * Check that the correct lock is held. 
The lock mode is + * AccessExclusiveLock for normal processing and ShareUpdateExclusiveLock + * for concurrent processing (so that SELECT, INSERT, UPDATE and DELETE + * commands work, but cluster_rel() cannot be called concurrently for the + * same relation). + */ + lmode = !concurrent ? AccessExclusiveLock : ShareUpdateExclusiveLock; + + /* + * Skip the relation if it's being processed concurrently. In such a case, + * we cannot rely on a lock because the other backend needs to release it + * temporarily at some point. + * + * This check should not take place until we have a lock that prevents + * another backend from starting VACUUM FULL / CLUSTER CONCURRENTLY after + * our check. + */ + Assert(CheckRelationLockedByMe(OldHeap, lmode, false)); + if (is_concurrent_cluster_in_progress(tableOid)) + { + ereport(NOTICE, + (errmsg(CLUSTER_IN_PROGRESS_MESSAGE, + RelationGetRelationName(OldHeap)))); + table_close(OldHeap, lmode); + return; + } + + /* There are specific requirements on concurrent processing. */ + if (concurrent) + { + check_concurrent_cluster_requirements(OldHeap, isTopLevel, + OidIsValid(indexOid)); - Assert(CheckRelationLockedByMe(OldHeap, AccessExclusiveLock, false)); + check_relation_is_clusterable_concurrently(OldHeap, isVacuum); + } /* Check for user-requested abort. 
*/ CHECK_FOR_INTERRUPTS(); @@ -355,7 +615,7 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) /* Check that the user still has privileges for the relation */ if (!cluster_is_permitted_for_relation(tableOid, save_userid)) { - relation_close(OldHeap, AccessExclusiveLock); + relation_close(OldHeap, lmode); goto out; } @@ -370,7 +630,7 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) */ if (RELATION_IS_OTHER_TEMP(OldHeap)) { - relation_close(OldHeap, AccessExclusiveLock); + relation_close(OldHeap, lmode); goto out; } @@ -381,7 +641,7 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) */ if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid))) { - relation_close(OldHeap, AccessExclusiveLock); + relation_close(OldHeap, lmode); goto out; } @@ -392,7 +652,7 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 && !get_index_isclustered(indexOid)) { - relation_close(OldHeap, AccessExclusiveLock); + relation_close(OldHeap, lmode); goto out; } } @@ -408,6 +668,11 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot cluster a shared catalog"))); + /* + * The CONCURRENT case should have been rejected earlier because it does + * not support system catalogs. + */ + Assert(!(OldHeap->rd_rel->relisshared && concurrent)); /* * Don't process temp tables of other backends ... their local buffer @@ -434,7 +699,7 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) /* Check heap and index are valid to cluster on */ if (OidIsValid(indexOid)) { - check_index_is_clusterable(OldHeap, indexOid, AccessExclusiveLock); + check_index_is_clusterable(OldHeap, indexOid, lmode); /* Open the index (It should already be locked.) 
*/ index = index_open(indexOid, NoLock); } @@ -449,7 +714,8 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW && !RelationIsPopulated(OldHeap)) { - relation_close(OldHeap, AccessExclusiveLock); + index_close(index, lmode); + relation_close(OldHeap, lmode); goto out; } @@ -462,11 +728,42 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) * invalid, because we move tuples around. Promote them to relation * locks. Predicate locks on indexes will be promoted when they are * reindexed. + * + * During concurrent processing, the heap as well as its indexes stay in + * operation, so we postpone this step until they are locked using + * AccessExclusiveLock near the end of the processing. */ - TransferPredicateLocksToHeapRelation(OldHeap); + if (!concurrent) + TransferPredicateLocksToHeapRelation(OldHeap); /* rebuild_relation does all the dirty work */ - rebuild_relation(OldHeap, index, verbose); + entered = false; + success = false; + PG_TRY(); + { + /* + * For concurrent processing, make sure other transactions treat this + * table as if it was a system / user catalog, and WAL the relevant + * additional information. ERROR is raised if another backend is + * processing the same table. + */ + if (concurrent) + { + Relation *index_p = index ? &index : NULL; + + begin_concurrent_cluster(&OldHeap, index_p, &entered); + } + + rebuild_relation(OldHeap, index, verbose, concurrent, isVacuum); + success = true; + } + PG_FINALLY(); + { + if (concurrent && entered) + end_concurrent_cluster(!success); + } + PG_END_TRY(); + /* rebuild_relation closes OldHeap, and index if valid */ out: @@ -612,18 +909,84 @@ mark_index_clustered(Relation rel, Oid indexOid, bool is_internal) table_close(pg_index, RowExclusiveLock); } +/* + * Check if the CONCURRENTLY option is legal for the relation. 
+ */ +void +check_relation_is_clusterable_concurrently(Relation rel, bool is_vacuum) +{ + char relpersistence, replident; + Oid ident_idx; + const char *cmd = is_vacuum ? "VACUUM" : "CLUSTER"; + + /* Data changes in system relations are not logically decoded. */ + if (IsCatalogRelation(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot process relation \"%s\"", + RelationGetRelationName(rel)), + errhint("%s (CONCURRENTLY) is not supported for catalog relations.", + cmd))); + + if (IsToastRelation(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot process relation \"%s\"", + RelationGetRelationName(rel)), + errhint("%s (CONCURRENTLY) is not supported for TOAST relations, unless the main relation is processed too.", + cmd))); + + relpersistence = rel->rd_rel->relpersistence; + if (relpersistence != RELPERSISTENCE_PERMANENT) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot process relation \"%s\"", + RelationGetRelationName(rel)), + errhint("%s (CONCURRENTLY) is only allowed for permanent relations.", + cmd))); + + /* With NOTHING, WAL does not contain the old tuple. */ + replident = rel->rd_rel->relreplident; + if (replident == REPLICA_IDENTITY_NOTHING) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot process relation \"%s\"", + RelationGetRelationName(rel)), + errhint("Relation \"%s\" has insufficient replication identity.", + RelationGetRelationName(rel)))); + + /* + * Identity index is not set if the replica identity is FULL, but PK might + * exist in such a case. 
+ */ + ident_idx = RelationGetReplicaIndex(rel); + if (!OidIsValid(ident_idx) && OidIsValid(rel->rd_pkindex)) + ident_idx = rel->rd_pkindex; + if (!OidIsValid(ident_idx)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot process relation \"%s\"", + RelationGetRelationName(rel)), + (errhint("Relation \"%s\" has no identity index.", + RelationGetRelationName(rel))))); +} + /* * rebuild_relation: rebuild an existing relation in index or physical order * - * OldHeap: table to rebuild --- must be opened and exclusive-locked! + * OldHeap: table to rebuild --- must be opened and locked. See cluster_rel() + * for comments on the required lock strength. + * * index: index to cluster by, or NULL to rewrite in physical order. Must be * opened and locked. * * On exit, the heap (and also the index, if one was passed) are closed, but - * still locked with AccessExclusiveLock. + * still locked with AccessExclusiveLock. (The function handles the lock + * upgrade if 'concurrent' is true.) */ static void -rebuild_relation(Relation OldHeap, Relation index, bool verbose) +rebuild_relation(Relation OldHeap, Relation index, bool verbose, + bool concurrent, bool is_vacuum) { Oid tableOid = RelationGetRelid(OldHeap); Oid accessMethod = OldHeap->rd_rel->relam; @@ -631,10 +994,75 @@ rebuild_relation(Relation OldHeap, Relation index, bool verbose) Oid OIDNewHeap; Relation NewHeap; char relpersistence; - bool is_system_catalog; bool swap_toast_by_content; TransactionId frozenXid; MultiXactId cutoffMulti; + NameData slotname; + LogicalDecodingContext *ctx = NULL; + Snapshot snapshot = NULL; + CatalogState *cat_state = NULL; + + if (concurrent) + { + TupleDesc tupdesc; + RelReopenInfo rri[2]; + int nrel; + + /* + * CLUSTER CONCURRENTLY is not allowed in a transaction block, so this + * should never fire. 
+ */ + Assert(GetTopTransactionIdIfAny() == InvalidTransactionId); + + /* + * A single backend should not execute multiple CLUSTER commands at a + * time, so use PID to make the slot unique. + */ + snprintf(NameStr(slotname), NAMEDATALEN, "cluster_%d", MyProcPid); + + /* + * Gather catalog information so that we can check later if the old + * relation has not changed while unlocked. + * + * Since this function also checks if the relation can be processed, + * it's important to call it before we spend notable amount of time to + * setup the logical decoding. Not sure though if it's necessary to do + * it even earlier. + */ + cat_state = get_catalog_state(OldHeap, is_vacuum); + + tupdesc = CreateTupleDescCopy(RelationGetDescr(OldHeap)); + + /* + * Unlock the relation (and possibly the clustering index) to avoid + * deadlock because setup_logical_decoding() will wait for all the + * running transactions (with XID assigned) to finish. Some of those + * transactions might be waiting for a lock on our relation. + */ + nrel = 0; + init_rel_reopen_info(&rri[nrel++], &OldHeap, InvalidOid, + ShareUpdateExclusiveLock, + ShareUpdateExclusiveLock); + if (index) + init_rel_reopen_info(&rri[nrel++], &index, InvalidOid, + ShareUpdateExclusiveLock, + ShareUpdateExclusiveLock); + unlock_and_close_relations(rri, nrel); + + /* Prepare to capture the concurrent data changes. */ + ctx = setup_logical_decoding(tableOid, NameStr(slotname), tupdesc); + + /* Lock the table (and index) again. */ + reopen_relations(rri, nrel); + + /* + * Check if a 'tupdesc' could have changed while the relation was + * unlocked. 
+ */ + check_catalog_changes(OldHeap, cat_state); + + snapshot = SnapBuildInitialSnapshotForCluster(ctx->snapshot_builder); + } if (index) /* Mark the correct index as clustered */ @@ -642,7 +1070,6 @@ rebuild_relation(Relation OldHeap, Relation index, bool verbose) /* Remember info about rel before closing OldHeap */ relpersistence = OldHeap->rd_rel->relpersistence; - is_system_catalog = IsSystemRelation(OldHeap); /* * Create the transient table that will receive the re-ordered data. @@ -661,30 +1088,51 @@ rebuild_relation(Relation OldHeap, Relation index, bool verbose) Assert(CheckRelationLockedByMe(NewHeap, AccessExclusiveLock, false)); /* Copy the heap data into the new table in the desired order */ - copy_table_data(NewHeap, OldHeap, index, verbose, + copy_table_data(NewHeap, OldHeap, index, snapshot, ctx, verbose, &swap_toast_by_content, &frozenXid, &cutoffMulti); + if (concurrent) + { + rebuild_relation_finish_concurrent(NewHeap, OldHeap, index, + cat_state, ctx, + swap_toast_by_content, + frozenXid, cutoffMulti); + + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP); + + /* Done with decoding. */ + FreeSnapshot(snapshot); + free_catalog_state(cat_state); + cleanup_logical_decoding(ctx); + ReplicationSlotRelease(); + ReplicationSlotDrop(NameStr(slotname), false); + } + else + { + bool is_system_catalog = IsSystemRelation(OldHeap); - /* Close relcache entries, but keep lock until transaction commit */ - table_close(OldHeap, NoLock); - if (index) - index_close(index, NoLock); + /* Close relcache entries, but keep lock until transaction commit */ + table_close(OldHeap, NoLock); + if (index) + index_close(index, NoLock); - /* - * Close the new relation so it can be dropped as soon as the storage is - * swapped. The relation is not visible to others, so no need to unlock it - * explicitly. - */ - table_close(NewHeap, NoLock); + /* + * Close the new relation so it can be dropped as soon as the storage + * is swapped. 
The relation is not visible to others, so no need to + * unlock it explicitly. + */ + table_close(NewHeap, NoLock); - /* - * Swap the physical files of the target and transient tables, then - * rebuild the target's indexes and throw away the transient table. - */ - finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, - swap_toast_by_content, false, true, - frozenXid, cutoffMulti, - relpersistence); + /* + * Swap the physical files of the target and transient tables, then + * rebuild the target's indexes and throw away the transient table. + */ + finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, + swap_toast_by_content, false, true, true, + frozenXid, cutoffMulti, + relpersistence); + } } @@ -819,15 +1267,19 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod, /* * Do the physical copying of table data. * + * 'snapshot' and 'decoding_ctx': see table_relation_copy_for_cluster(). Pass + * iff concurrent processing is required. + * * There are three output parameters: * *pSwapToastByContent is set true if toast tables must be swapped by content. * *pFreezeXid receives the TransactionId used as freeze cutoff point. * *pCutoffMulti receives the MultiXactId used as a cutoff point. */ static void -copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verbose, - bool *pSwapToastByContent, TransactionId *pFreezeXid, - MultiXactId *pCutoffMulti) +copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, + Snapshot snapshot, LogicalDecodingContext *decoding_ctx, + bool verbose, bool *pSwapToastByContent, + TransactionId *pFreezeXid, MultiXactId *pCutoffMulti) { Relation relRelation; HeapTuple reltup; @@ -844,6 +1296,7 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb int elevel = verbose ? 
INFO : DEBUG2; PGRUsage ru0; char *nspname; + bool concurrent = snapshot != NULL; pg_rusage_init(&ru0); @@ -870,8 +1323,12 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb * * We don't need to open the toast relation here, just lock it. The lock * will be held till end of transaction. + * + * In the CONCURRENT case, the lock does not help because we need to + * release it temporarily at some point. Instead, we expect VACUUM / + * CLUSTER to skip tables which are present in ClusteredRelsHash. */ - if (OldHeap->rd_rel->reltoastrelid) + if (OldHeap->rd_rel->reltoastrelid && !concurrent) LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock); /* @@ -947,8 +1404,46 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb * provided, else plain seqscan. */ if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID) + { + ResourceOwner oldowner = CurrentResourceOwner; + + /* + * In the CONCURRENT case, do the planning in a subtransaction so that + * we don't leave any additional locks behind us that we cannot + * release easily. + */ + if (concurrent) + { + Assert(CheckRelationLockedByMe(OldHeap, ShareUpdateExclusiveLock, + false)); + Assert(CheckRelationLockedByMe(OldIndex, ShareUpdateExclusiveLock, + false)); + BeginInternalSubTransaction("plan_cluster_use_sort"); + } + use_sort = plan_cluster_use_sort(RelationGetRelid(OldHeap), RelationGetRelid(OldIndex)); + + if (concurrent) + { + PgBackendProgress progress; + + /* + * Command progress reporting gets terminated at subtransaction + * end. Save the status so it can be eventually restored. + */ + memcpy(&progress, &MyBEEntry->st_progress, + sizeof(PgBackendProgress)); + + /* Release the locks by aborting the subtransaction. */ + RollbackAndReleaseCurrentSubTransaction(); + + /* Restore the progress reporting status. 
*/ + pgstat_progress_restore_state(&progress); + + CurrentResourceOwner = oldowner; + } + } else use_sort = false; @@ -977,7 +1472,9 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb * values (e.g. because the AM doesn't use freezing). */ table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort, - cutoffs.OldestXmin, &cutoffs.FreezeLimit, + cutoffs.OldestXmin, snapshot, + decoding_ctx, + &cutoffs.FreezeLimit, &cutoffs.MultiXactCutoff, &num_tuples, &tups_vacuumed, &tups_recently_dead); @@ -986,7 +1483,11 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb *pFreezeXid = cutoffs.FreezeLimit; *pCutoffMulti = cutoffs.MultiXactCutoff; - /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */ + /* + * Reset rd_toastoid just to be tidy --- it shouldn't be looked at + * again. In the CONCURRENT case, we need to set it again before applying + * the concurrent changes. + */ NewHeap->rd_toastoid = InvalidOid; num_pages = RelationGetNumberOfBlocks(NewHeap); @@ -1439,14 +1940,13 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool swap_toast_by_content, bool check_constraints, bool is_internal, + bool reindex, TransactionId frozenXid, MultiXactId cutoffMulti, char newrelpersistence) { ObjectAddress object; Oid mapped_tables[4]; - int reindex_flags; - ReindexParams reindex_params = {0}; int i; /* Report that we are now swapping relation files */ @@ -1472,39 +1972,46 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, if (is_system_catalog) CacheInvalidateCatalog(OIDOldHeap); - /* - * Rebuild each index on the relation (but not the toast table, which is - * all-new at this point). It is important to do this before the DROP - * step because if we are processing a system catalog that will be used - * during DROP, we want to have its indexes available. 
There is no - * advantage to the other order anyway because this is all transactional, - * so no chance to reclaim disk space before commit. We do not need a - * final CommandCounterIncrement() because reindex_relation does it. - * - * Note: because index_build is called via reindex_relation, it will never - * set indcheckxmin true for the indexes. This is OK even though in some - * sense we are building new indexes rather than rebuilding existing ones, - * because the new heap won't contain any HOT chains at all, let alone - * broken ones, so it can't be necessary to set indcheckxmin. - */ - reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE; - if (check_constraints) - reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS; + if (reindex) + { + int reindex_flags; + ReindexParams reindex_params = {0}; - /* - * Ensure that the indexes have the same persistence as the parent - * relation. - */ - if (newrelpersistence == RELPERSISTENCE_UNLOGGED) - reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED; - else if (newrelpersistence == RELPERSISTENCE_PERMANENT) - reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT; + /* + * Rebuild each index on the relation (but not the toast table, which + * is all-new at this point). It is important to do this before the + * DROP step because if we are processing a system catalog that will + * be used during DROP, we want to have its indexes available. There + * is no advantage to the other order anyway because this is all + * transactional, so no chance to reclaim disk space before commit. + * We do not need a final CommandCounterIncrement() because + * reindex_relation does it. + * + * Note: because index_build is called via reindex_relation, it will never + * set indcheckxmin true for the indexes. This is OK even though in some + * sense we are building new indexes rather than rebuilding existing ones, + * because the new heap won't contain any HOT chains at all, let alone + * broken ones, so it can't be necessary to set indcheckxmin. 
+ */ + reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE; + if (check_constraints) + reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS; - /* Report that we are now reindexing relations */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_REBUILD_INDEX); + /* + * Ensure that the indexes have the same persistence as the parent + * relation. + */ + if (newrelpersistence == RELPERSISTENCE_UNLOGGED) + reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED; + else if (newrelpersistence == RELPERSISTENCE_PERMANENT) + reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT; - reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params); + /* Report that we are now reindexing relations */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_REBUILD_INDEX); + + reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params); + } /* Report that we are now doing clean up */ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, @@ -1744,3 +2251,1886 @@ cluster_is_permitted_for_relation(Oid relid, Oid userid) get_rel_name(relid)))); return false; } + +#define REPL_PLUGIN_NAME "pgoutput_cluster" + +/* + * Each relation being processed by CLUSTER CONCURRENTLY must be in the + * clusteredRels hashtable. + */ +typedef struct ClusteredRel +{ + Oid relid; + Oid dbid; +} ClusteredRel; + +static HTAB *ClusteredRelsHash = NULL; + +/* Maximum number of entries in the hashtable. */ +static int maxClusteredRels = 0; + +Size +ClusterShmemSize(void) +{ + /* + * A replication slot is needed for the processing, so use this GUC to + * allocate memory for the hashtable. Reserve also space for TOAST + * relations. 
+ */ + maxClusteredRels = max_replication_slots * 2; + + return hash_estimate_size(maxClusteredRels, sizeof(ClusteredRel)); +} + +void +ClusterShmemInit(void) +{ + HASHCTL info; + + info.keysize = sizeof(ClusteredRel); + info.entrysize = info.keysize; + + ClusteredRelsHash = ShmemInitHash("Clustered Relations", + maxClusteredRels, + maxClusteredRels, + &info, + HASH_ELEM | HASH_BLOBS); +} + +/* + * Perform a preliminary check whether CLUSTER / VACUUM FULL CONCURRENTLY is + * possible. Note that here we only check things that should not change if we + * release the relation lock temporarily. The information that can change due + * to unlocking is checked in get_catalog_state(). + */ +static void +check_concurrent_cluster_requirements(Relation rel, bool isTopLevel, + bool isCluster) +{ + const char *stmt; + + if (isCluster) + stmt = "CLUSTER (CONCURRENTLY)"; + else + stmt = "VACUUM (FULL, CONCURRENTLY)"; + + /* + * Make sure we have no XID assigned, otherwise call of + * setup_logical_decoding() can cause a deadlock. + */ + PreventInTransactionBlock(isTopLevel, stmt); + + CheckSlotPermissions(); + + /* + * Use an existing function to check if we can use logical + * decoding. However note that RecoveryInProgress() should already have + * caused error, as it does for the non-concurrent VACUUM FULL / CLUSTER. + */ + CheckLogicalDecodingRequirements(); + + /* See ClusterShmemSize() */ + if (max_replication_slots < 2) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + (errmsg("%s requires \"max_replication_slots\" to be at least 2", + stmt))); +} + +/* + * Call this function before CLUSTER CONCURRENTLY starts to setup logical + * decoding. It makes sure that other users of the table put enough + * information into WAL. + * + * The point is that on various places we expect that the table we're + * processing is treated like a system catalog. 
For example, we need to be + * able to scan it using a "historic snapshot" anytime during the processing + * (as opposed to scanning only at the start point of the decoding, as logical + * replication does during initial table synchronization), in order to apply + * concurrent UPDATE / DELETE commands. + * + * Since we need to close and reopen the relation here, the 'rel_p' and + * 'index_p' arguments are in/out. + * + * 'entered_p' receives a bool value telling whether relation OID was entered + * into the hashtable or not. + */ +static void +begin_concurrent_cluster(Relation *rel_p, Relation *index_p, + bool *entered_p) +{ + Relation rel = *rel_p; + Oid relid, toastrelid; + ClusteredRel key, *entry; + bool found; + RelReopenInfo rri[2]; + int nrel; + static bool before_shmem_exit_callback_setup = false; + + relid = RelationGetRelid(rel); + + /* + * Make sure that we do not leave an entry in ClusteredRelsHash if exiting + * due to FATAL. + */ + if (!before_shmem_exit_callback_setup) + { + before_shmem_exit(cluster_before_shmem_exit_callback, 0); + before_shmem_exit_callback_setup = true; + } + + memset(&key, 0, sizeof(key)); + key.relid = relid; + key.dbid = MyDatabaseId; + + *entered_p = false; + LWLockAcquire(ClusteredRelsLock, LW_EXCLUSIVE); + entry = (ClusteredRel *) + hash_search(ClusteredRelsHash, &key, HASH_ENTER_NULL, &found); + if (found) + { + /* + * Since CLUSTER CONCURRENTLY takes ShareUpdateExclusiveLock, a conflict + * should occur much earlier. However that lock may be released + * temporarily, see below. Anyway, we should complain whatever the + * reason of the conflict might be. 
+ */ + ereport(ERROR, + (errmsg(CLUSTER_IN_PROGRESS_MESSAGE, + RelationGetRelationName(rel)))); + } + if (entry == NULL) + ereport(ERROR, + (errmsg("too many requests for CLUSTER CONCURRENTLY at a time")), + (errhint("Please consider increasing the \"max_replication_slots\" configuration parameter."))); + + /* + * Even if the insertion of TOAST relid should fail below, the caller has + * to do cleanup. + */ + *entered_p = true; + + /* + * Enable the callback to remove the entry in case of exit. We should not + * do this earlier, otherwise an attempt to insert already existing entry + * could make us remove that entry (inserted by another backend) during + * ERROR handling. + */ + Assert(!OidIsValid(clustered_rel)); + clustered_rel = relid; + + /* + * TOAST relation is not accessed using historic snapshot, but we enter it + * here to protect it from being VACUUMed by another backend. (Lock does + * not help in the CONCURRENT case because cannot hold it continuously + * till the end of the transaction.) See the comments on locking TOAST + * relation in copy_table_data(). + */ + toastrelid = rel->rd_rel->reltoastrelid; + if (OidIsValid(toastrelid)) + { + key.relid = toastrelid; + entry = (ClusteredRel *) + hash_search(ClusteredRelsHash, &key, HASH_ENTER_NULL, &found); + if (found) + /* + * If we could enter the main fork the TOAST should succeed + * too. Nevertheless, check. + */ + ereport(ERROR, + (errmsg("TOAST relation of \"%s\" is already being processed by CLUSTER CONCURRENTLY", + RelationGetRelationName(rel)))); + if (entry == NULL) + ereport(ERROR, + (errmsg("too many requests for CLUSTER CONCURRENT at a time")), + (errhint("Please consider increasing the \"max_replication_slots\" configuration parameter."))); + + Assert(!OidIsValid(clustered_rel_toast)); + clustered_rel_toast = toastrelid; + } + LWLockRelease(ClusteredRelsLock); + + /* + * Make sure that other backends are aware of the new hash entry. 
+ * + * Besides sending the invalidation message, we need to force re-opening + * of the relation, which includes the actual invalidation (and thus + * checking of our hashtable on the next access). + */ + CacheInvalidateRelcacheImmediate(rel); + /* + * Since the hashtable only needs to be checked by write transactions, + * lock the relation in a mode that conflicts with any DML command. (The + * reading transactions are supposed to close the relation before opening + * it with higher lock.) Once we have the relation (and its index) locked, + * we unlock it immediately and then re-lock using the original mode. + */ + nrel = 0; + init_rel_reopen_info(&rri[nrel++], rel_p, InvalidOid, + ShareUpdateExclusiveLock, ShareLock); + if (index_p) + { + /* + * Another transaction might want to open both the relation and the + * index. If it already has the relation lock and is waiting for the + * index lock, we should release the index lock, otherwise our request + * for ShareLock on the relation can end up in a deadlock. + */ + init_rel_reopen_info(&rri[nrel++], index_p, InvalidOid, + ShareUpdateExclusiveLock, ShareLock); + } + unlock_and_close_relations(rri, nrel); + /* + * XXX It's not strictly necessary to lock the index here, but it's + * probably not worth teaching the "reopen API" about this special case. + */ + reopen_relations(rri, nrel); + + /* Switch back to the original lock. */ + nrel = 0; + init_rel_reopen_info(&rri[nrel++], rel_p, InvalidOid, + ShareLock, ShareUpdateExclusiveLock); + if (index_p) + init_rel_reopen_info(&rri[nrel++], index_p, InvalidOid, + ShareLock, ShareUpdateExclusiveLock); + unlock_and_close_relations(rri, nrel); + reopen_relations(rri, nrel); + /* Make sure the reopened relcache entry is used, not the old one. */ + rel = *rel_p; + + /* Avoid logical decoding of other relations by this backend. 
*/ + clustered_rel_locator = rel->rd_locator; + if (OidIsValid(toastrelid)) + { + Relation toastrel; + + /* Avoid logical decoding of other TOAST relations. */ + toastrel = table_open(toastrelid, AccessShareLock); + clustered_rel_toast_locator = toastrel->rd_locator; + table_close(toastrel, AccessShareLock); + } +} + +/* + * Call this when done with CLUSTER CONCURRENTLY. + * + * 'error' tells whether the function is being called in order to handle + * error. + */ +static void +end_concurrent_cluster(bool error) +{ + ClusteredRel key; + ClusteredRel *entry = NULL, *entry_toast = NULL; + Oid relid = clustered_rel; + Oid toastrelid = clustered_rel_toast; + + /* Remove the relation from the hash if we managed to insert one. */ + if (OidIsValid(clustered_rel)) + { + memset(&key, 0, sizeof(key)); + key.relid = clustered_rel; + key.dbid = MyDatabaseId; + LWLockAcquire(ClusteredRelsLock, LW_EXCLUSIVE); + entry = hash_search(ClusteredRelsHash, &key, HASH_REMOVE, NULL); + + /* + * By clearing this variable we also disable + * cluster_before_shmem_exit_callback(). + */ + clustered_rel = InvalidOid; + } + + /* Remove the TOAST relation if there is one. */ + if (OidIsValid(clustered_rel_toast)) + { + key.relid = clustered_rel_toast; + entry_toast = hash_search(ClusteredRelsHash, &key, HASH_REMOVE, + NULL); + + clustered_rel_toast = InvalidOid; + } + LWLockRelease(ClusteredRelsLock); + + /* Restore normal function of logical decoding. */ + clustered_rel_locator.relNumber = InvalidOid; + clustered_rel_toast_locator.relNumber = InvalidOid; + + /* + * On normal completion (!error), we should not really fail to remove the + * entry. But if it wasn't there for any reason, raise ERROR to make sure + * the transaction is aborted: if other transactions, while changing the + * contents of the relation, didn't know that CLUSTER CONCURRENTLY was in + * progress, they could have missed to WAL enough information, and thus we + * could have produced an inconsistent table contents. 
+ * + * On the other hand, if we are already handling an error, there's no + * reason to worry about inconsistent contents of the new storage because + * the transaction is going to be rolled back anyway. Furthermore, by + * raising ERROR here we'd shadow the original error. + */ + if (!error) + { + char *relname; + + if (OidIsValid(relid) && entry == NULL) + { + relname = get_rel_name(relid); + if (!relname) + ereport(ERROR, + (errmsg("cache lookup failed for relation %u", + relid))); + + ereport(ERROR, + (errmsg("relation \"%s\" not found among clustered relations", + relname))); + } + + /* + * Likewise, the TOAST relation should not have disappeared. + */ + if (OidIsValid(toastrelid) && entry_toast == NULL) + { + relname = get_rel_name(key.relid); + if (!relname) + ereport(ERROR, + (errmsg("cache lookup failed for relation %u", + key.relid))); + + ereport(ERROR, + (errmsg("relation \"%s\" not found among clustered relations", + relname))); + } + } + + /* + * Note: unlike begin_concurrent_cluster(), here we do not lock/unlock the + * relation: 1) On normal completion, the caller is already holding + * AccessExclusiveLock (till the end of the transaction), 2) on ERROR / + * FATAL, we try to do the cleanup asap, but the worst case is that other + * backends will write unnecessary information to WAL until they close the + * relation. + */ +} + +/* + * A wrapper to call end_concurrent_cluster() as a before_shmem_exit callback. + */ +static void +cluster_before_shmem_exit_callback(int code, Datum arg) +{ + if (OidIsValid(clustered_rel) || OidIsValid(clustered_rel_toast)) + end_concurrent_cluster(true); +} + +/* + * Check if relation is currently being processed by CLUSTER CONCURRENTLY. 
+ */
+bool
+is_concurrent_cluster_in_progress(Oid relid)
+{
+	ClusteredRel lookup;
+	bool		in_progress;
+
+	/* Build the lookup key: the struct is zeroed, then the fields are set. */
+	memset(&lookup, 0, sizeof(lookup));
+	lookup.dbid = MyDatabaseId;
+	lookup.relid = relid;
+
+	/* A shared lock is sufficient, the hashtable is only read here. */
+	LWLockAcquire(ClusteredRelsLock, LW_SHARED);
+	in_progress = (hash_search(ClusteredRelsHash, &lookup, HASH_FIND,
+							   NULL) != NULL);
+	LWLockRelease(ClusteredRelsLock);
+
+	return in_progress;
+}
+
+/*
+ * Raise ERROR if VACUUM FULL / CLUSTER CONCURRENTLY is already running for
+ * the given relation. cluster_rel() needs to release its lock on the relation
+ * temporarily at some point, so the caller's lock alone does not help.
+ * Commands that might break what cluster_rel() is doing should call this
+ * function first.
+ *
+ * Nothing is checked if 'lockmode' allows for race conditions which would
+ * make the result meaningless. In that case, cluster_rel() itself should
+ * throw ERROR if the relation was changed by us in an incompatible
+ * way. However, if it managed to do most of its work by then, a lot of CPU
+ * time might be wasted.
+ */
+void
+check_for_concurrent_cluster(Oid relid, LOCKMODE lockmode)
+{
+	/*
+	 * The check only makes sense if the caller holds a lock that conflicts
+	 * with ShareUpdateExclusiveLock. Otherwise VACUUM FULL / CLUSTER
+	 * CONCURRENTLY can start anytime after the check.
+	 */
+	if (lockmode >= ShareUpdateExclusiveLock &&
+		is_concurrent_cluster_in_progress(relid))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg(CLUSTER_IN_PROGRESS_MESSAGE,
+						get_rel_name(relid))));
+	}
+}
+
+/*
+ * Check if relation is eligible for CLUSTER CONCURRENTLY and retrieve the
+ * catalog state to be passed later to check_catalog_changes.
+ *
+ * Caller is supposed to hold (at least) ShareUpdateExclusiveLock on the
+ * relation.
+ */
+static CatalogState *
+get_catalog_state(Relation rel, bool is_vacuum)
+{
+	CatalogState *result = palloc_object(CatalogState);
+	List	   *ind_oids;
+	ListCell   *lc;
+	int			ninds,
+				i;
+	char		relpersistence = rel->rd_rel->relpersistence;
+	char		replident = rel->rd_rel->relreplident;
+	Oid			ident_idx = RelationGetReplicaIndex(rel);
+	TupleDesc	td_src = RelationGetDescr(rel);
+
+	/*
+	 * While gathering the catalog information, check if there is a reason not
+	 * to proceed.
+	 *
+	 * This function was already called, but the relation was unlocked since
+	 * (see begin_concurrent_cluster()). check_catalog_changes() should catch
+	 * any "disruptive" changes in the future.
+	 */
+	check_relation_is_clusterable_concurrently(rel, is_vacuum);
+
+	/* No index should be dropped while we are checking it. */
+	Assert(CheckRelationLockedByMe(rel, ShareUpdateExclusiveLock, true));
+
+	ind_oids = RelationGetIndexList(rel);
+	result->ninds = ninds = list_length(ind_oids);
+	result->ind_oids = palloc_array(Oid, ninds);
+	result->ind_tupdescs = palloc_array(TupleDesc, ninds);
+	i = 0;
+	foreach(lc, ind_oids)
+	{
+		Oid			ind_oid = lfirst_oid(lc);
+		Relation	index;
+		TupleDesc	td_ind_src,
+					td_ind_dst;
+
+		/*
+		 * Weaker lock should be o.k. for the index, but this one should not
+		 * break anything either.
+		 */
+		index = index_open(ind_oid, ShareUpdateExclusiveLock);
+
+		result->ind_oids[i] = RelationGetRelid(index);
+		td_ind_src = RelationGetDescr(index);
+		td_ind_dst = palloc(TupleDescSize(td_ind_src));
+		TupleDescCopy(td_ind_dst, td_ind_src);
+		result->ind_tupdescs[i] = td_ind_dst;
+		i++;
+
+		index_close(index, ShareUpdateExclusiveLock);
+	}
+
+	/*
+	 * check_catalog_changes() falls back to the primary key index if the
+	 * relation has no replica identity index. Record the identity index the
+	 * same way here, otherwise the later comparison would report a change
+	 * that did not happen.
+	 */
+	if (ident_idx == InvalidOid && rel->rd_pkindex != InvalidOid)
+		ident_idx = rel->rd_pkindex;
+
+	/* Fill-in the relation info. */
+	result->tupdesc = palloc(TupleDescSize(td_src));
+	TupleDescCopy(result->tupdesc, td_src);
+	result->relpersistence = relpersistence;
+	result->replident = replident;
+	result->replidindex = ident_idx;
+
+	return result;
+}
+
+/*
+ * Release everything allocated by get_catalog_state(), including 'state'
+ * itself. This is done regardless of the number of indexes recorded.
+ */
+static void
+free_catalog_state(CatalogState *state)
+{
+	for (int i = 0; i < state->ninds; i++)
+		FreeTupleDesc(state->ind_tupdescs[i]);
+
+	FreeTupleDesc(state->tupdesc);
+	pfree(state->ind_oids);
+	pfree(state->ind_tupdescs);
+	pfree(state);
+}
+
+/*
+ * Raise ERROR if 'rel' changed in a way that does not allow further
+ * processing of CLUSTER CONCURRENTLY.
+ *
+ * Besides the relation's tuple descriptor, it's important to check indexes:
+ * concurrent change of index definition (can it happen in other way than
+ * dropping and re-creating the index, accidentally with the same OID?) can be
+ * a problem because we may already have the new index built. If an index was
+ * created or dropped concurrently, we'd fail to swap the index storage. In
+ * any case, we prefer to check the indexes early to get an explicit error
+ * message about the mismatch. Furthermore, the earlier we detect the change,
+ * the fewer CPU cycles we waste.
+ *
+ * Note that we do not check constraints because the transaction which changed
+ * them must have ensured that the existing tuples satisfy the new
+ * constraints. If any DML commands were necessary for that, we will simply
+ * decode them from WAL and apply them to the new storage.
+ *
+ * Caller is supposed to hold (at least) ShareUpdateExclusiveLock on the
+ * relation.
+ */
+static void
+check_catalog_changes(Relation rel, CatalogState *cat_state)
+{
+	Oid			reltoastrelid = rel->rd_rel->reltoastrelid;
+	List	   *ind_oids;
+	ListCell   *lc;
+	LOCKMODE	lockmode;
+	Oid			ident_idx;
+	TupleDesc	td,
+				td_cp;
+
+	/* First, check the relation info. */
+
+	/* TOAST is not easy to change, but check. */
+	if (reltoastrelid != clustered_rel_toast)
+		ereport(ERROR,
+				errmsg("TOAST relation of relation \"%s\" changed by another transaction",
+					   RelationGetRelationName(rel)));
+
+	/*
+	 * Likewise, check_for_concurrent_cluster() should prevent others from
+	 * changing the relation file concurrently, but it's our responsibility to
+	 * avoid data loss. (The original locators are stored outside cat_state,
+	 * but the check belongs to this function.)
+	 */
+	if (!RelFileLocatorEquals(rel->rd_locator, clustered_rel_locator))
+		ereport(ERROR,
+				(errmsg("file of relation \"%s\" changed by another transaction",
+						RelationGetRelationName(rel))));
+	if (OidIsValid(reltoastrelid))
+	{
+		Relation	toastrel;
+
+		toastrel = table_open(reltoastrelid, AccessShareLock);
+		if (!RelFileLocatorEquals(toastrel->rd_locator,
+								  clustered_rel_toast_locator))
+			ereport(ERROR,
+					(errmsg("file of relation \"%s\" changed by another transaction",
+							RelationGetRelationName(toastrel))));
+		table_close(toastrel, AccessShareLock);
+	}
+
+	if (rel->rd_rel->relpersistence != cat_state->relpersistence)
+		ereport(ERROR,
+				errmsg("persistence of relation \"%s\" changed by another transaction",
+					   RelationGetRelationName(rel)));
+
+	if (cat_state->replident != rel->rd_rel->relreplident)
+		ereport(ERROR,
+				errmsg("replica identity of relation \"%s\" changed by another transaction",
+					   RelationGetRelationName(rel)));
+
+	/* Use the primary key index if there is no replica identity index. */
+	ident_idx = RelationGetReplicaIndex(rel);
+	if (ident_idx == InvalidOid && rel->rd_pkindex != InvalidOid)
+		ident_idx = rel->rd_pkindex;
+	if (cat_state->replidindex != ident_idx)
+		ereport(ERROR,
+				errmsg("identity index of relation \"%s\" changed by another transaction",
+					   RelationGetRelationName(rel)));
+
+	/*
+	 * As cat_state contains a copy (which has the constraint info cleared),
+	 * create a temporary copy for the comparison.
+	 */
+	td = RelationGetDescr(rel);
+	td_cp = palloc(TupleDescSize(td));
+	TupleDescCopy(td_cp, td);
+	if (!equalTupleDescs(cat_state->tupdesc, td_cp))
+		ereport(ERROR,
+				errmsg("definition of relation \"%s\" changed by another transaction",
+					   RelationGetRelationName(rel)));
+	FreeTupleDesc(td_cp);
+
+	/* Now we are only interested in indexes. */
+	if (cat_state->ninds == 0)
+		return;
+
+	/* No index should be dropped while we are checking the relation. */
+	lockmode = ShareUpdateExclusiveLock;
+	Assert(CheckRelationLockedByMe(rel, lockmode, true));
+
+	ind_oids = RelationGetIndexList(rel);
+	if (list_length(ind_oids) != cat_state->ninds)
+		goto failed_index;
+
+	foreach(lc, ind_oids)
+	{
+		Oid			ind_oid = lfirst_oid(lc);
+		int			i;
+		TupleDesc	tupdesc;
+		Relation	index;
+
+		/* Find the index in cat_state. */
+		for (i = 0; i < cat_state->ninds; i++)
+		{
+			if (cat_state->ind_oids[i] == ind_oid)
+				break;
+		}
+		/*
+		 * OID not found, i.e. the index was replaced by another one. XXX
+		 * Should we yet try to find if an index having the desired tuple
+		 * descriptor exists? Or should we always look for the tuple
+		 * descriptor and not use OIDs at all?
+		 */
+		if (i == cat_state->ninds)
+			goto failed_index;
+
+		/* Check the tuple descriptor. */
+		index = try_index_open(ind_oid, lockmode);
+		if (index == NULL)
+			goto failed_index;
+		tupdesc = RelationGetDescr(index);
+		if (!equalTupleDescs(cat_state->ind_tupdescs[i], tupdesc))
+			goto failed_index;
+		index_close(index, lockmode);
+	}
+
+	return;
+
+failed_index:
+	ereport(ERROR,
+			(errmsg("index(es) of relation \"%s\" changed by another transaction",
+					RelationGetRelationName(rel))));
+}
+
+/*
+ * This function is much like pg_create_logical_replication_slot() except that
+ * the new slot is neither released (if anyone else could read changes from
+ * our slot, we could miss changes other backends do while we copy the
+ * existing data into temporary table), nor persisted (it's easier to handle
+ * crash by restarting all the work from scratch).
+ *
+ * XXX Even though CreateInitDecodingContext() does not set state to
+ * RS_PERSISTENT, it does write the slot to disk. We rely on
+ * RestoreSlotFromDisk() to delete ephemeral slots during startup. (Both ERROR
+ * and FATAL should lead to cleanup even before the cluster goes down.)
+ */
+static LogicalDecodingContext *
+setup_logical_decoding(Oid relid, const char *slotname, TupleDesc tupdesc)
+{
+	LogicalDecodingContext *ctx;
+	ClusterDecodingState *dstate;
+
+	/* RS_TEMPORARY so that the slot gets cleaned up on ERROR. */
+	ReplicationSlotCreate(slotname, true, RS_TEMPORARY, false, false, false);
+
+	/*
+	 * Neither prepare_write nor do_write callback nor update_progress is
+	 * useful for us.
+	 *
+	 * Regarding the value of need_full_snapshot, we pass false because the
+	 * table we are processing is present in ClusteredRelsHash and therefore,
+	 * regarding logical decoding, treated like a catalog.
+	 */
+	ctx = CreateInitDecodingContext(REPL_PLUGIN_NAME,
+									NIL,
+									false,
+									InvalidXLogRecPtr,
+									XL_ROUTINE(.page_read = read_local_xlog_page,
+											   .segment_open = wal_segment_open,
+											   .segment_close = wal_segment_close),
+									NULL, NULL, NULL);
+
+	/*
+	 * We don't have control on setting fast_forward, so at least check it.
+	 */
+	Assert(!ctx->fast_forward);
+
+	DecodingContextFindStartpoint(ctx);
+
+	/* Some WAL records should have been read. */
+	Assert(ctx->reader->EndRecPtr != InvalidXLogRecPtr);
+
+	XLByteToSeg(ctx->reader->EndRecPtr, cluster_current_segment,
+				wal_segment_size);
+
+	/*
+	 * Setup structures to store decoded changes.
+	 */
+	dstate = palloc0(sizeof(ClusterDecodingState));
+	dstate->relid = relid;
+	dstate->tstore = tuplestore_begin_heap(false, false,
+										   maintenance_work_mem);
+	dstate->tupdesc = tupdesc;
+
+	/* Initialize the descriptor to store the changes ... */
+	dstate->tupdesc_change = CreateTemplateTupleDesc(1);
+
+	TupleDescInitEntry(dstate->tupdesc_change, 1, NULL, BYTEAOID, -1, 0);
+	/* ... as well as the corresponding slot. */
+	dstate->tsslot = MakeSingleTupleTableSlot(dstate->tupdesc_change,
+											  &TTSOpsMinimalTuple);
+
+	dstate->resowner = ResourceOwnerCreate(CurrentResourceOwner,
+										   "logical decoding");
+
+	ctx->output_writer_private = dstate;
+	return ctx;
+}
+
+/*
+ * Retrieve tuple from ConcurrentChange structure.
+ *
+ * The input data starts with the structure but it might not be appropriately
+ * aligned.
+ *
+ * The result is a single palloc'd chunk holding both the tuple header
+ * structure and the tuple data.
+ */
+static HeapTuple
+get_changed_tuple(char *change)
+{
+	HeapTupleData tup_data;
+	HeapTuple	result;
+	char	   *src;
+
+	/*
+	 * Ensure alignment before accessing the fields. (This is why we can't use
+	 * heap_copytuple() instead of this function.)
+	 */
+	src = change + offsetof(ConcurrentChange, tup_data);
+	memcpy(&tup_data, src, sizeof(HeapTupleData));
+
+	result = (HeapTuple) palloc(HEAPTUPLESIZE + tup_data.t_len);
+	memcpy(result, &tup_data, sizeof(HeapTupleData));
+	result->t_data = (HeapTupleHeader) ((char *) result + HEAPTUPLESIZE);
+	src = change + SizeOfConcurrentChange;
+	memcpy(result->t_data, src, result->t_len);
+
+	return result;
+}
+
+/*
+ * Decode logical changes from the WAL sequence up to end_of_wal.
+ */
+void
+cluster_decode_concurrent_changes(LogicalDecodingContext *ctx,
+								  XLogRecPtr end_of_wal)
+{
+	ClusterDecodingState *dstate;
+	ResourceOwner resowner_old;
+	PgBackendProgress progress;
+
+	/*
+	 * Invalidate the "present" cache before moving to "(recent) history".
+	 */
+	InvalidateSystemCaches();
+
+	dstate = (ClusterDecodingState *) ctx->output_writer_private;
+	resowner_old = CurrentResourceOwner;
+	CurrentResourceOwner = dstate->resowner;
+
+	/*
+	 * reorderbuffer.c uses internal subtransaction, whose abort ends the
+	 * command progress reporting. Save the status here so we can restore when
+	 * done with the decoding.
+	 */
+	memcpy(&progress, &MyBEEntry->st_progress, sizeof(PgBackendProgress));
+
+	PG_TRY();
+	{
+		while (ctx->reader->EndRecPtr < end_of_wal)
+		{
+			XLogRecord *record;
+			XLogSegNo	segno_new;
+			char	   *errm = NULL;
+			XLogRecPtr	end_lsn;
+
+			record = XLogReadRecord(ctx->reader, &errm);
+			if (errm)
+				elog(ERROR, "%s", errm);
+
+			if (record != NULL)
+				LogicalDecodingProcessRecord(ctx, ctx->reader);
+
+			/*
+			 * If WAL segment boundary has been crossed, inform the decoding
+			 * system that the catalog_xmin can advance. (We can confirm more
+			 * often, but filling a single WAL segment should not take much
+			 * time.)
+			 */
+			end_lsn = ctx->reader->EndRecPtr;
+			XLByteToSeg(end_lsn, segno_new, wal_segment_size);
+			if (segno_new != cluster_current_segment)
+			{
+				LogicalConfirmReceivedLocation(end_lsn);
+				elog(DEBUG1, "cluster: confirmed receive location %X/%X",
+					 (uint32) (end_lsn >> 32), (uint32) end_lsn);
+				cluster_current_segment = segno_new;
+			}
+
+			CHECK_FOR_INTERRUPTS();
+		}
+		InvalidateSystemCaches();
+		CurrentResourceOwner = resowner_old;
+	}
+	PG_CATCH();
+	{
+		InvalidateSystemCaches();
+		CurrentResourceOwner = resowner_old;
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	/* Restore the progress reporting status. */
+	pgstat_progress_restore_state(&progress);
+}
+
+/*
+ * Apply changes that happened during the initial load.
+ *
+ * Scan key is passed by caller, so it does not have to be constructed
+ * multiple times. Key entries have all fields initialized, except for
+ * sk_argument.
+ */
+static void
+apply_concurrent_changes(ClusterDecodingState *dstate, Relation rel,
+						 ScanKey key, int nkeys, IndexInsertState *iistate)
+{
+	TupleTableSlot *index_slot,
+			   *ident_slot;
+	HeapTuple	tup_old = NULL;
+
+	if (dstate->nchanges == 0)
+		return;
+
+	/* TupleTableSlot is needed to pass the tuple to ExecInsertIndexTuples(). */
+	index_slot = MakeSingleTupleTableSlot(dstate->tupdesc, &TTSOpsHeapTuple);
+	iistate->econtext->ecxt_scantuple = index_slot;
+
+	/* A slot to fetch tuples from identity index. */
+	ident_slot = table_slot_create(rel, NULL);
+
+	while (tuplestore_gettupleslot(dstate->tstore, true, false,
+								   dstate->tsslot))
+	{
+		bool		shouldFree;
+		HeapTuple	tup_change,
+					tup,
+					tup_exist;
+		char	   *change_raw,
+				   *src;
+		ConcurrentChange change;
+		bool		isnull[1];
+		Datum		values[1];
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* Get the change from the single-column tuple. */
+		tup_change = ExecFetchSlotHeapTuple(dstate->tsslot, false, &shouldFree);
+		heap_deform_tuple(tup_change, dstate->tupdesc_change, values, isnull);
+		Assert(!isnull[0]);
+
+		/* Make sure we access aligned data. */
+		change_raw = (char *) DatumGetByteaP(values[0]);
+		src = (char *) VARDATA(change_raw);
+		memcpy(&change, src, SizeOfConcurrentChange);
+
+		/* TRUNCATE change contains no tuple, so process it separately. */
+		if (change.kind == CHANGE_TRUNCATE)
+		{
+			/*
+			 * All the things that ExecuteTruncateGuts() does (such as firing
+			 * triggers or handling the DROP_CASCADE behavior) should have
+			 * taken place on the source relation. Thus we only do the actual
+			 * truncation of the new relation (and its indexes).
+			 */
+			heap_truncate_one_rel(rel);
+
+			pfree(tup_change);
+			continue;
+		}
+
+		/*
+		 * Extract the tuple from the change. The tuple is copied here because
+		 * it might be assigned to 'tup_old', in which case it needs to
+		 * survive into the next iteration.
+		 */
+		tup = get_changed_tuple(src);
+
+		if (change.kind == CHANGE_UPDATE_OLD)
+		{
+			Assert(tup_old == NULL);
+			tup_old = tup;
+		}
+		else if (change.kind == CHANGE_INSERT)
+		{
+			Assert(tup_old == NULL);
+
+			apply_concurrent_insert(rel, &change, tup, iistate, index_slot);
+
+			pfree(tup);
+		}
+		else if (change.kind == CHANGE_UPDATE_NEW ||
+				 change.kind == CHANGE_DELETE)
+		{
+			IndexScanDesc ind_scan = NULL;
+			HeapTuple	tup_key;
+
+			if (change.kind == CHANGE_UPDATE_NEW)
+			{
+				/*
+				 * If the old tuple was received in the previous iteration,
+				 * use it to locate the target, else use the new one.
+				 */
+				tup_key = tup_old != NULL ? tup_old : tup;
+			}
+			else
+			{
+				Assert(tup_old == NULL);
+				tup_key = tup;
+			}
+
+			/*
+			 * Find the tuple to be updated or deleted.
+			 */
+			tup_exist = find_target_tuple(rel, key, nkeys, tup_key,
+										  iistate, ident_slot, &ind_scan);
+			if (tup_exist == NULL)
+				elog(ERROR, "Failed to find target tuple");
+
+			if (change.kind == CHANGE_UPDATE_NEW)
+				apply_concurrent_update(rel, tup, tup_exist, &change, iistate,
+										index_slot);
+			else
+				apply_concurrent_delete(rel, tup_exist, &change);
+
+			if (tup_old != NULL)
+			{
+				pfree(tup_old);
+				tup_old = NULL;
+			}
+
+			pfree(tup);
+			index_endscan(ind_scan);
+		}
+		else
+			elog(ERROR, "Unrecognized kind of change: %d", change.kind);
+
+		/* If there's any change, make it visible to the next iteration. */
+		if (change.kind != CHANGE_UPDATE_OLD)
+		{
+			CommandCounterIncrement();
+			UpdateActiveSnapshotCommandId();
+		}
+
+		/* TTSOpsMinimalTuple has .get_heap_tuple==NULL. */
+		Assert(shouldFree);
+		pfree(tup_change);
+	}
+
+	tuplestore_clear(dstate->tstore);
+	dstate->nchanges = 0;
+
+	/* Cleanup. */
+	ExecDropSingleTupleTableSlot(index_slot);
+	ExecDropSingleTupleTableSlot(ident_slot);
+}
+
+/*
+ * Insert 'tup' into 'rel' and add the corresponding index entries.
+ *
+ * 'index_slot' is the slot used to pass the tuple to ExecInsertIndexTuples().
+ */
+static void
+apply_concurrent_insert(Relation rel, ConcurrentChange *change, HeapTuple tup,
+						IndexInsertState *iistate, TupleTableSlot *index_slot)
+{
+	List	   *recheck;
+
+
+	heap_insert(rel, tup, GetCurrentCommandId(true), HEAP_INSERT_NO_LOGICAL, NULL);
+
+	/* Update indexes. */
+	ExecStoreHeapTuple(tup, index_slot, false);
+	recheck = ExecInsertIndexTuples(iistate->rri,
+									index_slot,
+									iistate->estate,
+									false,	/* update */
+									false,	/* noDupErr */
+									NULL,	/* specConflict */
+									NIL,	/* arbiterIndexes */
+									false	/* onlySummarizing */
+		);
+
+	/*
+	 * If recheck is required, it must have been performed on the source
+	 * relation by now. (All the logical changes we process here are already
+	 * committed.)
+	 */
+	list_free(recheck);
+
+	pgstat_progress_incr_param(PROGRESS_CLUSTER_HEAP_TUPLES_INSERTED, 1);
+}
+
+/*
+ * Replace the tuple 'tup_target' of 'rel' with 'tup' and, if the update
+ * requires it, add index entries for the new version.
+ */
+static void
+apply_concurrent_update(Relation rel, HeapTuple tup, HeapTuple tup_target,
+						ConcurrentChange *change, IndexInsertState *iistate,
+						TupleTableSlot *index_slot)
+{
+	List	   *recheck;
+	TU_UpdateIndexes update_indexes;
+
+	/*
+	 * Write the new tuple into the new heap. ('tup' gets the TID assigned
+	 * here.)
+	 */
+	simple_heap_update(rel, &tup_target->t_self, tup, &update_indexes);
+
+	ExecStoreHeapTuple(tup, index_slot, false);
+
+	if (update_indexes != TU_None)
+	{
+		recheck = ExecInsertIndexTuples(iistate->rri,
+										index_slot,
+										iistate->estate,
+										true,	/* update */
+										false,	/* noDupErr */
+										NULL,	/* specConflict */
+										NIL,	/* arbiterIndexes */
+		/* onlySummarizing */
+										update_indexes == TU_Summarizing);
+		list_free(recheck);
+	}
+
+	pgstat_progress_incr_param(PROGRESS_CLUSTER_HEAP_TUPLES_UPDATED, 1);
+}
+
+/*
+ * Delete the tuple 'tup_target' from 'rel'.
+ */
+static void
+apply_concurrent_delete(Relation rel, HeapTuple tup_target,
+						ConcurrentChange *change)
+{
+	simple_heap_delete(rel, &tup_target->t_self);
+
+	pgstat_progress_incr_param(PROGRESS_CLUSTER_HEAP_TUPLES_DELETED, 1);
+}
+
+/*
+ * Find the tuple to be updated or deleted.
+ *
+ * 'key' is a pre-initialized scan key, into which the function will put the
+ * key values.
+ *
+ * 'tup_key' is a tuple containing the key values for the scan.
+ *
+ * On exit,'*scan_p' contains the scan descriptor used. The caller must close
+ * it when he no longer needs the tuple returned.
+ */
+static HeapTuple
+find_target_tuple(Relation rel, ScanKey key, int nkeys, HeapTuple tup_key,
+				  IndexInsertState *iistate,
+				  TupleTableSlot *ident_slot, IndexScanDesc *scan_p)
+{
+	IndexScanDesc scan;
+	Form_pg_index ident_form;
+	int2vector *ident_indkey;
+	HeapTuple	result = NULL;
+
+	/* Start a scan of the identity index using the active snapshot. */
+	scan = index_beginscan(rel, iistate->ident_index, GetActiveSnapshot(),
+						   nkeys, 0);
+	/* Hand the descriptor over right away, the caller has to end the scan. */
+	*scan_p = scan;
+	index_rescan(scan, key, nkeys, NULL, 0);
+
+	/* Info needed to retrieve key values from heap tuple. */
+	ident_form = iistate->ident_index->rd_index;
+	ident_indkey = &ident_form->indkey;
+
+	/* Use the incoming tuple to finalize the scan key. */
+	for (int i = 0; i < scan->numberOfKeys; i++)
+	{
+		ScanKey		entry;
+		bool		isnull;
+		int16		attno_heap;
+
+		/* The i-th index column maps to heap attribute indkey[i]. */
+		entry = &scan->keyData[i];
+		attno_heap = ident_indkey->values[i];
+		entry->sk_argument = heap_getattr(tup_key,
+										  attno_heap,
+										  rel->rd_att,
+										  &isnull);
+		Assert(!isnull);
+	}
+	/* Only the first matching tuple is fetched; NULL is returned if none. */
+	if (index_getnext_slot(scan, ForwardScanDirection, ident_slot))
+	{
+		bool		shouldFree;
+
+		result = ExecFetchSlotHeapTuple(ident_slot, false, &shouldFree);
+		/* TTSOpsBufferHeapTuple has .get_heap_tuple != NULL. */
+		Assert(!shouldFree);
+	}
+
+	return result;
+}
+
+/*
+ * Decode and apply concurrent changes.
+ *
+ * The changes are decoded from WAL up to 'end_of_wal' and, if there are any,
+ * applied to 'rel_dst'.
+ *
+ * Pass rel_src iff its reltoastrelid is needed.
+ */
+static void
+process_concurrent_changes(LogicalDecodingContext *ctx, XLogRecPtr end_of_wal,
+						   Relation rel_dst, Relation rel_src, ScanKey ident_key,
+						   int ident_key_nentries, IndexInsertState *iistate)
+{
+	ClusterDecodingState *dstate;
+
+	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+								 PROGRESS_CLUSTER_PHASE_CATCH_UP);
+
+	dstate = (ClusterDecodingState *) ctx->output_writer_private;
+
+	cluster_decode_concurrent_changes(ctx, end_of_wal);
+
+	/* Nothing was decoded, so there is nothing to apply. */
+	if (dstate->nchanges == 0)
+		return;
+
+	PG_TRY();
+	{
+		/*
+		 * Make sure that TOAST values can eventually be accessed via the old
+		 * relation - see comment in copy_table_data().
+		 */
+		if (rel_src)
+			rel_dst->rd_toastoid = rel_src->rd_rel->reltoastrelid;
+
+		apply_concurrent_changes(dstate, rel_dst, ident_key,
+								 ident_key_nentries, iistate);
+	}
+	PG_FINALLY();
+	{
+		/* Reset rd_toastoid whether or not the changes were applied. */
+		if (rel_src)
+			rel_dst->rd_toastoid = InvalidOid;
+	}
+	PG_END_TRY();
+}
+
+/*
+ * Create the state needed to insert index entries for tuples of 'relation',
+ * and locate the already-opened relcache entry of the identity index
+ * 'ident_index_id' in it. ERROR is raised if 'relation' has no such index.
+ */
+static IndexInsertState *
+get_index_insert_state(Relation relation, Oid ident_index_id)
+{
+	EState	   *estate;
+	int			i;
+	IndexInsertState *result;
+
+	result = (IndexInsertState *) palloc0(sizeof(IndexInsertState));
+	estate = CreateExecutorState();
+	result->econtext = GetPerTupleExprContext(estate);
+
+	result->rri = (ResultRelInfo *) palloc(sizeof(ResultRelInfo));
+	InitResultRelInfo(result->rri, relation, 0, 0, 0);
+	ExecOpenIndices(result->rri, false);
+
+	/*
+	 * Find the relcache entry of the identity index so that we spend no extra
+	 * effort to open / close it.
+	 */
+	for (i = 0; i < result->rri->ri_NumIndices; i++)
+	{
+		Relation	ind_rel;
+
+		ind_rel = result->rri->ri_IndexRelationDescs[i];
+		if (ind_rel->rd_id == ident_index_id)
+			result->ident_index = ind_rel;
+	}
+	/* palloc0() above left ident_index NULL, so NULL means "not found". */
+	if (result->ident_index == NULL)
+		elog(ERROR, "Failed to open identity index");
+
+	/* Only initialize fields needed by ExecInsertIndexTuples(). */
+	result->estate = estate;
+
+	return result;
+}
+
+/*
+ * Build scan key to process logical changes.
+ *
+ * One entry is created for each key column of the index 'ident_idx_oid'.
+ * Non-key (INCLUDE) columns are not part of the scan key. Every field of the
+ * entries is initialized except for sk_argument. The number of entries is
+ * returned in '*nentries'.
+ *
+ * 'rel_src' is the relation whose tuple descriptor provides the collations of
+ * the key columns.
+ */
+static ScanKey
+build_identity_key(Oid ident_idx_oid, Relation rel_src, int *nentries)
+{
+	Relation	ident_idx_rel;
+	Form_pg_index ident_idx;
+	int			n,
+				i;
+	ScanKey		result;
+
+	Assert(OidIsValid(ident_idx_oid));
+	ident_idx_rel = index_open(ident_idx_oid, AccessShareLock);
+	ident_idx = ident_idx_rel->rd_index;
+
+	/*
+	 * Only key columns can be searched for, and only those have entries in
+	 * rd_opfamily / rd_opcintype. indnatts would also count the INCLUDE
+	 * columns.
+	 */
+	n = ident_idx->indnkeyatts;
+	result = (ScanKey) palloc(sizeof(ScanKeyData) * n);
+	for (i = 0; i < n; i++)
+	{
+		ScanKey		entry;
+		int16		relattno;
+		Form_pg_attribute att;
+		Oid			opfamily,
+					opcintype,
+					opno,
+					opcode;
+
+		entry = &result[i];
+		relattno = ident_idx->indkey.values[i];
+		if (relattno >= 1)
+		{
+			TupleDesc	desc;
+
+			desc = rel_src->rd_att;
+			att = TupleDescAttr(desc, relattno - 1);
+		}
+		else
+			elog(ERROR, "Unexpected attribute number %d in index", relattno);
+
+		/* Look up the function implementing "=" for the column. */
+		opfamily = ident_idx_rel->rd_opfamily[i];
+		opcintype = ident_idx_rel->rd_opcintype[i];
+		opno = get_opfamily_member(opfamily, opcintype, opcintype,
+								   BTEqualStrategyNumber);
+
+		if (!OidIsValid(opno))
+			elog(ERROR, "Failed to find = operator for type %u", opcintype);
+
+		opcode = get_opcode(opno);
+		if (!OidIsValid(opcode))
+			elog(ERROR, "Failed to find = operator for operator %u", opno);
+
+		/* Initialize everything but argument. */
+		ScanKeyInit(entry,
+					i + 1,
+					BTEqualStrategyNumber, opcode,
+					(Datum) NULL);
+		entry->sk_collation = att->attcollation;
+	}
+	index_close(ident_idx_rel, AccessShareLock);
+
+	*nentries = n;
+	return result;
+}
+
+/*
+ * Release the state created by get_index_insert_state(): close the indexes,
+ * free the executor state and free the structure itself.
+ */
+static void
+free_index_insert_state(IndexInsertState *iistate)
+{
+	ExecCloseIndices(iistate->rri);
+	FreeExecutorState(iistate->estate);
+	pfree(iistate->rri);
+	pfree(iistate);
+}
+
+/*
+ * Free the storage of decoded changes attached to 'ctx' as well as the
+ * decoding context itself.
+ */
+static void
+cleanup_logical_decoding(LogicalDecodingContext *ctx)
+{
+	ClusterDecodingState *dstate;
+
+	dstate = (ClusterDecodingState *) ctx->output_writer_private;
+
+	ExecDropSingleTupleTableSlot(dstate->tsslot);
+	FreeTupleDesc(dstate->tupdesc_change);
+	FreeTupleDesc(dstate->tupdesc);
+	tuplestore_end(dstate->tstore);
+
+	FreeDecodingContext(ctx);
+}
+
+/*
+ * The final steps of rebuild_relation() for concurrent processing.
+ *
+ * On entry, NewHeap is locked in AccessExclusiveLock mode. OldHeap and its
+ * clustering index (if one is passed) are still locked in a mode that allows
+ * concurrent data changes. On exit, both tables and their indexes are closed,
+ * but locked in AccessExclusiveLock mode.
+ */
+static void
+rebuild_relation_finish_concurrent(Relation NewHeap, Relation OldHeap,
+								   Relation cl_index,
+								   CatalogState *cat_state,
+								   LogicalDecodingContext *ctx,
+								   bool swap_toast_by_content,
+								   TransactionId frozenXid,
+								   MultiXactId cutoffMulti)
+{
+	LOCKMODE	lockmode_old PG_USED_FOR_ASSERTS_ONLY;
+	List	   *ind_oids_new;
+	Oid			old_table_oid = RelationGetRelid(OldHeap);
+	Oid			new_table_oid = RelationGetRelid(NewHeap);
+	List	   *ind_oids_old = RelationGetIndexList(OldHeap);
+	ListCell   *lc, *lc2;
+	char		relpersistence;
+	bool		is_system_catalog;
+	Oid			ident_idx_old, ident_idx_new;
+	IndexInsertState *iistate;
+	ScanKey		ident_key;
+	int			ident_key_nentries;
+	XLogRecPtr	wal_insert_ptr, end_of_wal;
+	char		dummy_rec_data = '\0';
+	RelReopenInfo	*rri = NULL;
+	int			nrel;
+	Relation	*ind_refs_all, *ind_refs_p;
+
+	/* Like in cluster_rel().
*/ + lockmode_old = ShareUpdateExclusiveLock; + Assert(CheckRelationLockedByMe(OldHeap, lockmode_old, false)); + Assert(cl_index == NULL || + CheckRelationLockedByMe(cl_index, lockmode_old, false)); + /* This is expected from the caller. */ + Assert(CheckRelationLockedByMe(NewHeap, AccessExclusiveLock, false)); + + ident_idx_old = RelationGetReplicaIndex(OldHeap); + + /* + * Unlike the exclusive case, we build new indexes for the new relation + * rather than swapping the storage and reindexing the old relation. The + * point is that the index build can take some time, so we do it before we + * get AccessExclusiveLock on the old heap and therefore we cannot swap + * the heap storage yet. + * + * index_create() will lock the new indexes using AccessExclusiveLock + * creation - no need to change that. + */ + ind_oids_new = build_new_indexes(NewHeap, OldHeap, ind_oids_old); + + /* + * Processing shouldn't start w/o valid identity index. + */ + Assert(OidIsValid(ident_idx_old)); + + /* Find "identity index" on the new relation. */ + ident_idx_new = InvalidOid; + forboth(lc, ind_oids_old, lc2, ind_oids_new) + { + Oid ind_old = lfirst_oid(lc); + Oid ind_new = lfirst_oid(lc2); + + if (ident_idx_old == ind_old) + { + ident_idx_new = ind_new; + break; + } + } + if (!OidIsValid(ident_idx_new)) + /* + * Should not happen, given our lock on the old relation. + */ + ereport(ERROR, + (errmsg("Identity index missing on the new relation"))); + + /* Executor state to update indexes. */ + iistate = get_index_insert_state(NewHeap, ident_idx_new); + + /* + * Build scan key that we'll use to look for rows to be updated / deleted + * during logical decoding. + */ + ident_key = build_identity_key(ident_idx_new, OldHeap, &ident_key_nentries); + + /* + * Flush all WAL records inserted so far (possibly except for the last + * incomplete page, see GetInsertRecPtr), to minimize the amount of data + * we need to flush while holding exclusive lock on the source table. 
+ */ + wal_insert_ptr = GetInsertRecPtr(); + XLogFlush(wal_insert_ptr); + end_of_wal = GetFlushRecPtr(NULL); + + /* + * Apply concurrent changes first time, to minimize the time we need to + * hold AccessExclusiveLock. (Quite some amount of WAL could have been + * written during the data copying and index creation.) + */ + process_concurrent_changes(ctx, end_of_wal, NewHeap, + swap_toast_by_content ? OldHeap : NULL, + ident_key, ident_key_nentries, iistate); + + /* + * Release the locks that allowed concurrent data changes, in order to + * acquire the AccessExclusiveLock. + */ + nrel = 0; + /* + * We unlock the old relation (and its clustering index), but then we will + * lock the relation and *all* its indexes because we want to swap their + * storage. + * + * (NewHeap is already locked, as well as its indexes.) + */ + rri = palloc_array(RelReopenInfo, 1 + list_length(ind_oids_old)); + init_rel_reopen_info(&rri[nrel++], &OldHeap, InvalidOid, + ShareUpdateExclusiveLock, AccessExclusiveLock); + /* References to the re-opened indexes will be stored in this array. */ + ind_refs_all = palloc_array(Relation, list_length(ind_oids_old)); + ind_refs_p = ind_refs_all; + /* The clustering index is a special case. */ + if (cl_index) + { + *ind_refs_p = cl_index; + init_rel_reopen_info(&rri[nrel], ind_refs_p, InvalidOid, + ShareUpdateExclusiveLock, AccessExclusiveLock); + nrel++; + ind_refs_p++; + } + /* + * Initialize also the entries for the other indexes (currently unlocked) + * because we will have to lock them. + */ + foreach(lc, ind_oids_old) + { + Oid ind_oid; + + ind_oid = lfirst_oid(lc); + /* Clustering index is already in the array, or there is none. */ + if (cl_index && RelationGetRelid(cl_index) == ind_oid) + continue; + + Assert(nrel < (1 + list_length(ind_oids_old))); + + *ind_refs_p = NULL; + init_rel_reopen_info(&rri[nrel], + /* + * In this special case we do not have the + * relcache reference, use OID instead. 
+ */ + ind_refs_p, + ind_oid, + NoLock, /* Nothing to unlock. */ + AccessExclusiveLock); + + nrel++; + ind_refs_p++; + } + /* Perform the actual unlocking and re-locking. */ + unlock_and_close_relations(rri, nrel); + reopen_relations(rri, nrel); + + /* + * In addition, lock the OldHeap's TOAST relation that we skipped for the + * CONCURRENTLY option in copy_table_data(). This lock will be needed to + * swap the relation files. + */ + if (OidIsValid(OldHeap->rd_rel->reltoastrelid)) + LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock); + + /* + * Check if the new indexes match the old ones, i.e. no changes occurred + * while OldHeap was unlocked. + * + * XXX It's probably not necessary to check the relation tuple descriptor + * here because the logical decoding was already active when we released + * the lock, and thus the corresponding data changes won't be lost. + * However processing of those changes might take a lot of time. + */ + check_catalog_changes(OldHeap, cat_state); + + /* + * Tuples and pages of the old heap will be gone, but the heap will stay. + */ + TransferPredicateLocksToHeapRelation(OldHeap); + /* The same for indexes. */ + for (int i = 0; i < (nrel - 1); i++) + { + Relation index = ind_refs_all[i]; + + TransferPredicateLocksToHeapRelation(index); + + /* + * References to indexes on the old relation are not needed anymore, + * however locks stay till the end of the transaction. + */ + index_close(index, NoLock); + } + pfree(ind_refs_all); + + /* + * Flush anything we see in WAL, to make sure that all changes committed + * while we were waiting for the exclusive lock are available for + * decoding. This should not be necessary if all backends had + * synchronous_commit set, but we can't rely on this setting. + * + * Unfortunately, GetInsertRecPtr() may lag behind the actual insert + * position, and GetLastImportantRecPtr() points at the start of the last + * record rather than at the end. 
Thus the simplest way to determine the + * insert position is to insert a dummy record and use its LSN. + * + * XXX Consider using GetLastImportantRecPtr() and adding the size of the + * last record (plus the total size of all the page headers the record + * spans)? + */ + XLogBeginInsert(); + XLogRegisterData(&dummy_rec_data, 1); + wal_insert_ptr = XLogInsert(RM_XLOG_ID, XLOG_NOOP); + XLogFlush(wal_insert_ptr); + end_of_wal = GetFlushRecPtr(NULL); + + /* Apply the concurrent changes again. */ + process_concurrent_changes(ctx, end_of_wal, NewHeap, + swap_toast_by_content ? OldHeap : NULL, + ident_key, ident_key_nentries, iistate); + + /* Remember info about rel before closing OldHeap */ + relpersistence = OldHeap->rd_rel->relpersistence; + is_system_catalog = IsSystemRelation(OldHeap); + + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES); + + forboth(lc, ind_oids_old, lc2, ind_oids_new) + { + Oid ind_old = lfirst_oid(lc); + Oid ind_new = lfirst_oid(lc2); + Oid mapped_tables[4]; + + /* Zero out possible results from swapped_relation_files */ + memset(mapped_tables, 0, sizeof(mapped_tables)); + + swap_relation_files(ind_old, ind_new, + (old_table_oid == RelationRelationId), + swap_toast_by_content, + true, + InvalidTransactionId, + InvalidMultiXactId, + mapped_tables); + +#ifdef USE_ASSERT_CHECKING + /* + * Concurrent processing is not supported for system relations, so + * there should be no mapped tables. + */ + for (int i = 0; i < 4; i++) + Assert(mapped_tables[i] == 0); +#endif + } + + /* The new indexes must be visible for deletion. */ + CommandCounterIncrement(); + + /* Close the old heap but keep lock until transaction commit. */ + table_close(OldHeap, NoLock); + /* Close the new heap. (We didn't have to open its indexes). */ + table_close(NewHeap, NoLock); + + /* Cleanup what we don't need anymore. (And close the identity index.) 
*/ + pfree(ident_key); + free_index_insert_state(iistate); + + /* + * Swap the relations and their TOAST relations and TOAST indexes. This + * also drops the new relation and its indexes. + * + * (System catalogs are currently not supported.) + */ + Assert(!is_system_catalog); + finish_heap_swap(old_table_oid, new_table_oid, + is_system_catalog, + swap_toast_by_content, + false, true, false, + frozenXid, cutoffMulti, + relpersistence); + + pfree(rri); +} + +/* + * Build indexes on NewHeap according to those on OldHeap. + * + * OldIndexes is the list of index OIDs on OldHeap. + * + * A list of OIDs of the corresponding indexes created on NewHeap is + * returned. The order of items does match, so we can use these arrays to swap + * index storage. + */ +static List * +build_new_indexes(Relation NewHeap, Relation OldHeap, List *OldIndexes) +{ + StringInfo ind_name; + ListCell *lc; + List *result = NIL; + + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_REBUILD_INDEX); + + ind_name = makeStringInfo(); + + foreach(lc, OldIndexes) + { + Oid ind_oid, + ind_oid_new, + tbsp_oid; + Relation ind; + IndexInfo *ind_info; + int i, + heap_col_id; + List *colnames; + int16 indnatts; + Oid *collations, + *opclasses; + HeapTuple tup; + bool isnull; + Datum d; + oidvector *oidvec; + int2vector *int2vec; + size_t oid_arr_size; + size_t int2_arr_size; + int16 *indoptions; + text *reloptions = NULL; + bits16 flags; + Datum *opclassOptions; + NullableDatum *stattargets; + + ind_oid = lfirst_oid(lc); + ind = index_open(ind_oid, AccessShareLock); + ind_info = BuildIndexInfo(ind); + + tbsp_oid = ind->rd_rel->reltablespace; + /* + * Index name really doesn't matter, we'll eventually use only their + * storage. Just make them unique within the table. 
+ */ + resetStringInfo(ind_name); + appendStringInfo(ind_name, "ind_%d", + list_cell_number(OldIndexes, lc)); + + flags = 0; + if (ind->rd_index->indisprimary) + flags |= INDEX_CREATE_IS_PRIMARY; + + colnames = NIL; + indnatts = ind->rd_index->indnatts; + oid_arr_size = sizeof(Oid) * indnatts; + int2_arr_size = sizeof(int16) * indnatts; + + collations = (Oid *) palloc(oid_arr_size); + for (i = 0; i < indnatts; i++) + { + char *colname; + + heap_col_id = ind->rd_index->indkey.values[i]; + if (heap_col_id > 0) + { + Form_pg_attribute att; + + /* Normal attribute. */ + att = TupleDescAttr(OldHeap->rd_att, heap_col_id - 1); + colname = pstrdup(NameStr(att->attname)); + collations[i] = att->attcollation; + } + else if (heap_col_id == 0) + { + HeapTuple tuple; + Form_pg_attribute att; + + /* + * Expression column is not present in relcache. What we need + * here is an attribute of the *index* relation. + */ + tuple = SearchSysCache2(ATTNUM, + ObjectIdGetDatum(ind_oid), + Int16GetDatum(i + 1)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, + "cache lookup failed for attribute %d of relation %u", + i + 1, ind_oid); + att = (Form_pg_attribute) GETSTRUCT(tuple); + colname = pstrdup(NameStr(att->attname)); + collations[i] = att->attcollation; + ReleaseSysCache(tuple); + } + else + elog(ERROR, "Unexpected column number: %d", + heap_col_id); + + colnames = lappend(colnames, colname); + } + + /* + * Special effort needed for variable length attributes of + * Form_pg_index. 
+ */ + tup = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(ind_oid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for index %u", ind_oid); + d = SysCacheGetAttr(INDEXRELID, tup, Anum_pg_index_indclass, &isnull); + Assert(!isnull); + oidvec = (oidvector *) DatumGetPointer(d); + opclasses = (Oid *) palloc(oid_arr_size); + memcpy(opclasses, oidvec->values, oid_arr_size); + + d = SysCacheGetAttr(INDEXRELID, tup, Anum_pg_index_indoption, + &isnull); + Assert(!isnull); + int2vec = (int2vector *) DatumGetPointer(d); + indoptions = (int16 *) palloc(int2_arr_size); + memcpy(indoptions, int2vec->values, int2_arr_size); + ReleaseSysCache(tup); + + tup = SearchSysCache1(RELOID, ObjectIdGetDatum(ind_oid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for index relation %u", ind_oid); + d = SysCacheGetAttr(RELOID, tup, Anum_pg_class_reloptions, &isnull); + reloptions = !isnull ? DatumGetTextPCopy(d) : NULL; + ReleaseSysCache(tup); + + opclassOptions = palloc0(sizeof(Datum) * ind_info->ii_NumIndexAttrs); + for (i = 0; i < ind_info->ii_NumIndexAttrs; i++) + opclassOptions[i] = get_attoptions(ind_oid, i + 1); + + stattargets = get_index_stattargets(ind_oid, ind_info); + + /* + * Neither parentIndexRelid nor parentConstraintId needs to be passed + * since the new catalog entries (pg_constraint, pg_inherits) would + * eventually be dropped. Therefore there's no need to record valid + * dependency on parents. 
+ */ + ind_oid_new = index_create(NewHeap, + ind_name->data, + InvalidOid, + InvalidOid, /* parentIndexRelid */ + InvalidOid, /* parentConstraintId */ + InvalidOid, + ind_info, + colnames, + ind->rd_rel->relam, + tbsp_oid, + collations, + opclasses, + opclassOptions, + indoptions, + stattargets, + PointerGetDatum(reloptions), + flags, /* flags */ + 0, /* constr_flags */ + false, /* allow_system_table_mods */ + false, /* is_internal */ + NULL /* constraintId */ + ); + result = lappend_oid(result, ind_oid_new); + + index_close(ind, AccessShareLock); + list_free_deep(colnames); + pfree(collations); + pfree(opclasses); + pfree(indoptions); + if (reloptions) + pfree(reloptions); + } + + return result; +} + +static void +init_rel_reopen_info(RelReopenInfo *rri, Relation *rel_p, Oid relid, + LOCKMODE lockmode_orig, LOCKMODE lockmode_new) +{ + rri->rel_p = rel_p; + rri->relid = relid; + rri->lockmode_orig = lockmode_orig; + rri->lockmode_new = lockmode_new; +} + +/* + * Unlock and close relations specified by items of the 'rels' array. 'nrel' + * is the number of items. + * + * Information needed to (re)open the relations (or to issue meaningful ERROR) + * is added to the array items. + */ +static void +unlock_and_close_relations(RelReopenInfo *rels, int nrel) +{ + int i; + RelReopenInfo *rri; + + /* + * First, retrieve the information that we will need for re-opening. + * + * We could close (and unlock) each relation as soon as we have gathered + * the related information, but then we would have to be careful not to + * unlock the table until we have the info on all its indexes. (Once we + * unlock the table, any index can be dropped, and thus we can fail to get + * the name we want to report if re-opening fails.) It seems simpler to + * separate the work into two iterations.
+ */ + for (i = 0; i < nrel; i++) + { + Relation rel; + + rri = &rels[i]; + rel = *rri->rel_p; + + if (rel) + { + Assert(CheckRelationLockedByMe(rel, rri->lockmode_orig, false)); + Assert(!OidIsValid(rri->relid)); + + rri->relid = RelationGetRelid(rel); + rri->relkind = rel->rd_rel->relkind; + rri->relname = pstrdup(RelationGetRelationName(rel)); + } + else + { + Assert(OidIsValid(rri->relid)); + + rri->relname = get_rel_name(rri->relid); + rri->relkind = get_rel_relkind(rri->relid); + } + } + + /* Second, close the relations. */ + for (i = 0; i < nrel; i++) + { + Relation rel; + + rri = &rels[i]; + rel = *rri->rel_p; + + /* Close the relation if the caller passed one. */ + if (rel) + { + if (rri->relkind == RELKIND_RELATION) + table_close(rel, rri->lockmode_orig); + else + { + Assert(rri->relkind == RELKIND_INDEX); + + index_close(rel, rri->lockmode_orig); + } + } + } +} + +/* + * Re-open the relations closed previously by unlock_and_close_relations(). + */ +static void +reopen_relations(RelReopenInfo *rels, int nrel) +{ + for (int i = 0; i < nrel; i++) + { + RelReopenInfo *rri = &rels[i]; + Relation rel; + + if (rri->relkind == RELKIND_RELATION) + { + rel = try_table_open(rri->relid, rri->lockmode_new); + } + else + { + Assert(rri->relkind == RELKIND_INDEX); + + rel = try_index_open(rri->relid, rri->lockmode_new); + } + + if (rel == NULL) + { + const char *kind_str; + + kind_str = (rri->relkind == RELKIND_RELATION) ? 
"table" : "index"; + ereport(ERROR, + (errmsg("could not open \%s \"%s\"", kind_str, + rri->relname), + errhint("The %s could have been dropped by another transaction.", + kind_str))); + } + *rri->rel_p = rel; + + pfree(rri->relname); + } +} diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 694da8291e..4fafa2f807 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -905,7 +905,7 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, static void refresh_by_heap_swap(Oid matviewOid, Oid OIDNewHeap, char relpersistence) { - finish_heap_swap(matviewOid, OIDNewHeap, false, false, true, true, + finish_heap_swap(matviewOid, OIDNewHeap, false, false, true, true, true, RecentXmin, ReadNextMultiXactId(), relpersistence); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 6ccae4cb4a..b0d6318592 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -4480,6 +4480,16 @@ AlterTableInternal(Oid relid, List *cmds, bool recurse) rel = relation_open(relid, lockmode); + /* + * If lockmode allows, check if VACUUM FULL / CLUSTER CONCURRENTLY is in + * progress. If lockmode is too weak, cluster_rel() should detect + * incompatible DDLs executed by us. + * + * XXX We might skip the changes for DDLs which do not change the tuple + * descriptor. 
+ */ + check_for_concurrent_cluster(relid, lockmode); + EventTriggerAlterTableRelid(relid); ATController(NULL, rel, cmds, recurse, lockmode, NULL); @@ -5909,6 +5919,7 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode, finish_heap_swap(tab->relid, OIDNewHeap, false, false, true, !OidIsValid(tab->newTableSpace), + true, RecentXmin, ReadNextMultiXactId(), persistence); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index a0158b1fcd..333ce98060 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -111,7 +111,7 @@ static void vac_truncate_clog(TransactionId frozenXID, TransactionId lastSaneFrozenXid, MultiXactId lastSaneMinMulti); static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, - BufferAccessStrategy bstrategy); + BufferAccessStrategy bstrategy, bool isTopLevel); static double compute_parallel_delay(void); static VacOptValue get_vacoptval_from_boolean(DefElem *def); static bool vac_tid_reaped(ItemPointer itemptr, void *state); @@ -153,6 +153,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) bool analyze = false; bool freeze = false; bool full = false; + bool concurrent = false; bool disable_page_skipping = false; bool process_main = true; bool process_toast = true; @@ -226,6 +227,8 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) freeze = defGetBoolean(opt); else if (strcmp(opt->defname, "full") == 0) full = defGetBoolean(opt); + else if (strcmp(opt->defname, "concurrently") == 0) + concurrent = defGetBoolean(opt); else if (strcmp(opt->defname, "disable_page_skipping") == 0) disable_page_skipping = defGetBoolean(opt); else if (strcmp(opt->defname, "index_cleanup") == 0) @@ -300,7 +303,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) (skip_locked ? VACOPT_SKIP_LOCKED : 0) | (analyze ? VACOPT_ANALYZE : 0) | (freeze ? VACOPT_FREEZE : 0) | - (full ? VACOPT_FULL : 0) | + (full ? (concurrent ? 
VACOPT_FULL_CONCURRENT : VACOPT_FULL_EXCLUSIVE) : 0) | (disable_page_skipping ? VACOPT_DISABLE_PAGE_SKIPPING : 0) | (process_main ? VACOPT_PROCESS_MAIN : 0) | (process_toast ? VACOPT_PROCESS_TOAST : 0) | @@ -380,6 +383,12 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) errmsg("ONLY_DATABASE_STATS cannot be specified with other VACUUM options"))); } + /* This problem cannot be identified from the options. */ + if (concurrent && !full) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("CONCURRENTLY can only be specified with VACUUM FULL"))); + /* * All freeze ages are zero if the FREEZE option is given; otherwise pass * them as -1 which means to use the default values. @@ -543,7 +552,17 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, relations = newrels; } else + { + /* + * Concurrent processing is currently considered rather special so it + * is not performed in bulk. + */ + if (params->options & VACOPT_FULL_CONCURRENT) + ereport(ERROR, + (errmsg("VACUUM (CONCURRENTLY) requires explicit list of tables"))); + relations = get_all_vacuum_rels(vac_context, params->options); + } /* * Decide whether we need to start/commit our own transactions. @@ -616,7 +635,8 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, if (params->options & VACOPT_VACUUM) { - if (!vacuum_rel(vrel->oid, vrel->relation, params, bstrategy)) + if (!vacuum_rel(vrel->oid, vrel->relation, params, bstrategy, + isTopLevel)) continue; } @@ -960,6 +980,17 @@ expand_vacuum_rel(VacuumRelation *vrel, MemoryContext vac_context, (errmsg("VACUUM ONLY of partitioned table \"%s\" has no effect", vrel->relation->relname))); + /* + * Concurrent processing is currently considered rather special + * (e.g. in terms of resources consumed) so it is not performed in + * bulk. 
+ */ + if (is_partitioned_table && (options & VACOPT_FULL_CONCURRENT)) + ereport(ERROR, + (errmsg("VACUUM (CONCURRENTLY) not supported for partitioned tables"), + errhint("Consider running the command for individual partitions."))); + + ReleaseSysCache(tuple); /* @@ -1954,10 +1985,10 @@ vac_truncate_clog(TransactionId frozenXID, /* * vacuum_rel() -- vacuum one heap relation * - * relid identifies the relation to vacuum. If relation is supplied, - * use the name therein for reporting any failure to open/lock the rel; - * do not use it once we've successfully opened the rel, since it might - * be stale. + * relid identifies the relation to vacuum. If relation is supplied, use + * the name therein for reporting any failure to open/lock the rel; do + * not use it once we've successfully opened the rel, since it might be + * stale. * * Returns true if it's okay to proceed with a requested ANALYZE * operation on this table. @@ -1972,7 +2003,7 @@ vac_truncate_clog(TransactionId frozenXID, */ static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, - BufferAccessStrategy bstrategy) + BufferAccessStrategy bstrategy, bool isTopLevel) { LOCKMODE lmode; Relation rel; @@ -2035,10 +2066,11 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, /* * Determine the type of lock we want --- hard exclusive lock for a FULL - * vacuum, but just ShareUpdateExclusiveLock for concurrent vacuum. Either - * way, we can be sure that no other backend is vacuuming the same table. + * exclusive vacuum, but a weaker lock (ShareUpdateExclusiveLock) for + * concurrent vacuum. Either way, we can be sure that no other backend is + * vacuuming the same table. */ - lmode = (params->options & VACOPT_FULL) ? + lmode = (params->options & VACOPT_FULL_EXCLUSIVE) ? 
AccessExclusiveLock : ShareUpdateExclusiveLock; /* open the relation and get the appropriate lock on it */ @@ -2053,6 +2085,22 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, return false; } + /* + * Skip the relation if VACUUM FULL / CLUSTER CONCURRENTLY is in progress + * as it will drop the current storage of the relation. + * + * This check should not take place until we have a lock that prevents + * another backend from starting VACUUM FULL / CLUSTER CONCURRENTLY later. + */ + Assert(lmode >= ShareUpdateExclusiveLock); + if (is_concurrent_cluster_in_progress(relid)) + { + relation_close(rel, lmode); + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } + /* * When recursing to a TOAST table, check privileges on the parent. NB: * This is only safe to do because we hold a session lock on the main @@ -2126,19 +2174,6 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, return true; } - /* - * Get a session-level lock too. This will protect our access to the - * relation across multiple transactions, so that we can vacuum the - * relation's TOAST table (if any) secure in the knowledge that no one is - * deleting the parent relation. - * - * NOTE: this cannot block, even if someone else is waiting for access, - * because the lock manager knows that both lock requests are from the - * same process. - */ - lockrelid = rel->rd_lockInfo.lockRelId; - LockRelationIdForSession(&lockrelid, lmode); - /* * Set index_cleanup option based on index_cleanup reloption if it wasn't * specified in VACUUM command, or when running in an autovacuum worker @@ -2191,6 +2226,30 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, else toast_relid = InvalidOid; + /* + * Get a session-level lock too. This will protect our access to the + * relation across multiple transactions, so that we can vacuum the + * relation's TOAST table (if any) secure in the knowledge that no one is + * deleting the parent relation. 
+ * + * NOTE: this cannot block, even if someone else is waiting for access, + * because the lock manager knows that both lock requests are from the + * same process. + */ + if (OidIsValid(toast_relid)) + { + /* + * You might worry that, in the VACUUM (FULL, CONCURRENTLY) case, + * cluster_rel() needs to release all the locks on the relation at + * some point, but this session lock makes it impossible. In fact, + * cluster_rel() will eventually be called for the TOAST relation + * and raise ERROR because, in the concurrent mode, it cannot process + * TOAST relation alone anyway. + */ + lockrelid = rel->rd_lockInfo.lockRelId; + LockRelationIdForSession(&lockrelid, lmode); + } + /* * Switch to the table owner's userid, so that any index functions are run * as that user. Also lock down security-restricted operations and @@ -2218,11 +2277,22 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, { ClusterParams cluster_params = {0}; + /* + * Invalid toast_relid means that there is no session lock on the + * relation. Such a lock would be a problem because it would + * prevent cluster_rel() from releasing all locks when it tries to + * get AccessExclusiveLock.
+ */ + Assert(!OidIsValid(toast_relid)); + if ((params->options & VACOPT_VERBOSE) != 0) cluster_params.options |= CLUOPT_VERBOSE; + if ((params->options & VACOPT_FULL_CONCURRENT) != 0) + cluster_params.options |= CLUOPT_CONCURRENT; + /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */ - cluster_rel(rel, InvalidOid, &cluster_params); + cluster_rel(rel, InvalidOid, &cluster_params, isTopLevel, true); /* cluster_rel closes the relation, but keeps lock */ rel = NULL; @@ -2268,13 +2338,15 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, toast_vacuum_params.options |= VACOPT_PROCESS_MAIN; toast_vacuum_params.toast_parent = relid; - vacuum_rel(toast_relid, NULL, &toast_vacuum_params, bstrategy); + vacuum_rel(toast_relid, NULL, &toast_vacuum_params, bstrategy, + isTopLevel); } /* * Now release the session-level lock on the main table. */ - UnlockRelationIdForSession(&lockrelid, lmode); + if (OidIsValid(toast_relid)) + UnlockRelationIdForSession(&lockrelid, lmode); /* Report that we really did it. 
*/ return true; diff --git a/src/backend/meson.build b/src/backend/meson.build index 78c5726814..0f9141a4ac 100644 --- a/src/backend/meson.build +++ b/src/backend/meson.build @@ -194,5 +194,6 @@ pg_test_mod_args = pg_mod_args + { subdir('jit/llvm') subdir('replication/libpqwalreceiver') subdir('replication/pgoutput') +subdir('replication/pgoutput_cluster') subdir('snowball') subdir('utils/mb/conversion_procs') diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index e73576ad12..06a9d4a61f 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -33,6 +33,7 @@ #include "access/xlogreader.h" #include "access/xlogrecord.h" #include "catalog/pg_control.h" +#include "commands/cluster.h" #include "replication/decode.h" #include "replication/logical.h" #include "replication/message.h" @@ -467,6 +468,29 @@ heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) TransactionId xid = XLogRecGetXid(buf->record); SnapBuild *builder = ctx->snapshot_builder; + /* + * Check if CLUSTER CONCURRENTLY is being performed by this backend. If + * so, only decode data changes of the table that it is processing, and + * the changes of its TOAST relation. + * + * (TOAST locator should not be set unless the main is.) + */ + Assert(!OidIsValid(clustered_rel_toast_locator.relNumber) || + OidIsValid(clustered_rel_locator.relNumber)); + + if (OidIsValid(clustered_rel_locator.relNumber)) + { + XLogReaderState *r = buf->record; + RelFileLocator locator; + + /* Not all records contain the block. 
*/ + if (XLogRecGetBlockTagExtended(r, 0, &locator, NULL, NULL, NULL) && + !RelFileLocatorEquals(locator, clustered_rel_locator) && + (!OidIsValid(clustered_rel_toast_locator.relNumber) || + !RelFileLocatorEquals(locator, clustered_rel_toast_locator))) + return; + } + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); /* diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 097dc82f6f..61a57053c7 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -486,6 +486,26 @@ SnapBuildInitialSnapshot(SnapBuild *builder) return SnapBuildMVCCFromHistoric(snap, true); } +/* + * Build an MVCC snapshot for the initial data load performed by CLUSTER + * CONCURRENTLY command. + * + * The snapshot will only be used to scan one particular relation, which is + * treated like a catalog (therefore ->building_full_snapshot is not + * important), and the caller should already have a replication slot setup (so + * we do not set MyProc->xmin). XXX Do we yet need to add some restrictions? + */ +Snapshot +SnapBuildInitialSnapshotForCluster(SnapBuild *builder) +{ + Snapshot snap; + + Assert(builder->state == SNAPBUILD_CONSISTENT); + + snap = SnapBuildBuildSnapshot(builder); + return SnapBuildMVCCFromHistoric(snap, false); +} + /* * Turn a historic MVCC snapshot into an ordinary MVCC snapshot. 
* diff --git a/src/backend/replication/pgoutput_cluster/Makefile b/src/backend/replication/pgoutput_cluster/Makefile new file mode 100644 index 0000000000..31471bb546 --- /dev/null +++ b/src/backend/replication/pgoutput_cluster/Makefile @@ -0,0 +1,32 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/pgoutput_cluster +# +# IDENTIFICATION +# src/backend/replication/pgoutput_cluster +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/pgoutput_cluster +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + $(WIN32RES) \ + pgoutput_cluster.o +PGFILEDESC = "pgoutput_cluster - logical replication output plugin for CLUSTER command" +NAME = pgoutput_cluster + +all: all-shared-lib + +include $(top_srcdir)/src/Makefile.shlib + +install: all installdirs install-lib + +installdirs: installdirs-lib + +uninstall: uninstall-lib + +clean distclean: clean-lib + rm -f $(OBJS) diff --git a/src/backend/replication/pgoutput_cluster/meson.build b/src/backend/replication/pgoutput_cluster/meson.build new file mode 100644 index 0000000000..0f033064f2 --- /dev/null +++ b/src/backend/replication/pgoutput_cluster/meson.build @@ -0,0 +1,18 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +pgoutput_cluster_sources = files( + 'pgoutput_cluster.c', +) + +if host_system == 'windows' + pgoutput_cluster_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pgoutput_cluster', + '--FILEDESC', 'pgoutput_cluster - logical replication output plugin for CLUSTER command',]) +endif + +pgoutput_cluster = shared_module('pgoutput_cluster', + pgoutput_cluster_sources, + kwargs: pg_mod_args, +) + +backend_targets += pgoutput_cluster diff --git a/src/backend/replication/pgoutput_cluster/pgoutput_cluster.c b/src/backend/replication/pgoutput_cluster/pgoutput_cluster.c new file mode 100644 index 
0000000000..43f7b34297 --- /dev/null +++ b/src/backend/replication/pgoutput_cluster/pgoutput_cluster.c @@ -0,0 +1,288 @@ +/* TODO Move into src/backend/cluster/ (and rename?) */ +/*------------------------------------------------------------------------- + * + * pgoutput_cluster.c + * Logical Replication output plugin for CLUSTER command + * + * Copyright (c) 2012-2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/pgoutput_cluster/pgoutput_cluster.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heaptoast.h" +#include "commands/cluster.h" +#include "replication/snapbuild.h" + +PG_MODULE_MAGIC; + +static void plugin_startup(LogicalDecodingContext *ctx, + OutputPluginOptions *opt, bool is_init); +static void plugin_shutdown(LogicalDecodingContext *ctx); +static void plugin_begin_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); +static void plugin_commit_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, XLogRecPtr commit_lsn); +static void plugin_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + Relation rel, ReorderBufferChange *change); +static void plugin_truncate(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, int nrelations, + Relation relations[], + ReorderBufferChange *change); +static void store_change(LogicalDecodingContext *ctx, + ConcurrentChangeKind kind, HeapTuple tuple); + +void +_PG_output_plugin_init(OutputPluginCallbacks *cb) +{ + AssertVariableIsOfType(&_PG_output_plugin_init, LogicalOutputPluginInit); + + cb->startup_cb = plugin_startup; + cb->begin_cb = plugin_begin_txn; + cb->change_cb = plugin_change; + cb->truncate_cb = plugin_truncate; + cb->commit_cb = plugin_commit_txn; + cb->shutdown_cb = plugin_shutdown; +} + + +/* initialize this plugin */ +static void +plugin_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, + bool is_init) +{ + ctx->output_plugin_private = NULL; 
+ + /* Probably unnecessary, as we don't use the SQL interface ... */ + opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT; + + if (ctx->output_plugin_options != NIL) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("This plugin does not expect any options"))); + } +} + +static void +plugin_shutdown(LogicalDecodingContext *ctx) +{ +} + +/* + * As we don't release the slot during processing of particular table, there's + * no room for SQL interface, even for debugging purposes. Therefore we need + * neither OutputPluginPrepareWrite() nor OutputPluginWrite() in the plugin + * callbacks. (Although we might want to write custom callbacks, this API + * seems to be unnecessarily generic for our purposes.) + */ + +/* BEGIN callback */ +static void +plugin_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn) +{ +} + +/* COMMIT callback */ +static void +plugin_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ +} + +/* + * Callback for individual changed tuples + */ +static void +plugin_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + ClusterDecodingState *dstate; + + dstate = (ClusterDecodingState *) ctx->output_writer_private; + + /* Only interested in one particular relation. */ + if (relation->rd_id != dstate->relid) + return; + + /* Decode entry depending on its type */ + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INSERT: + { + HeapTuple newtuple; + + newtuple = change->data.tp.newtuple != NULL ? + change->data.tp.newtuple : NULL; + + /* + * Identity checks in the main function should have made this + * impossible. + */ + if (newtuple == NULL) + elog(ERROR, "Incomplete insert info."); + + store_change(ctx, CHANGE_INSERT, newtuple); + } + break; + case REORDER_BUFFER_CHANGE_UPDATE: + { + HeapTuple oldtuple, + newtuple; + + oldtuple = change->data.tp.oldtuple != NULL ? 
+ change->data.tp.oldtuple : NULL; + newtuple = change->data.tp.newtuple != NULL ? + change->data.tp.newtuple : NULL; + + if (newtuple == NULL) + elog(ERROR, "Incomplete update info."); + + if (oldtuple != NULL) + store_change(ctx, CHANGE_UPDATE_OLD, oldtuple); + + store_change(ctx, CHANGE_UPDATE_NEW, newtuple); + } + break; + case REORDER_BUFFER_CHANGE_DELETE: + { + HeapTuple oldtuple; + + oldtuple = change->data.tp.oldtuple ? + change->data.tp.oldtuple : NULL; + + if (oldtuple == NULL) + elog(ERROR, "Incomplete delete info."); + + store_change(ctx, CHANGE_DELETE, oldtuple); + } + break; + default: + /* Should not come here */ + Assert(false); + break; + } +} + +static void +plugin_truncate(struct LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + int nrelations, Relation relations[], + ReorderBufferChange *change) +{ + ClusterDecodingState *dstate; + int i; + Relation relation = NULL; + + dstate = (ClusterDecodingState *) ctx->output_writer_private; + + /* Find the relation we are processing. */ + for (i = 0; i < nrelations; i++) + { + relation = relations[i]; + + if (RelationGetRelid(relation) == dstate->relid) + break; + } + + /* Is this truncation of another relation? */ + if (i == nrelations) + return; + + store_change(ctx, CHANGE_TRUNCATE, NULL); +} + +/* Store concurrent data change. */ +static void +store_change(LogicalDecodingContext *ctx, ConcurrentChangeKind kind, + HeapTuple tuple) +{ + ClusterDecodingState *dstate; + char *change_raw; + ConcurrentChange change; + bool flattened = false; + Size size; + Datum values[1]; + bool isnull[1]; + char *dst, *dst_start; + + dstate = (ClusterDecodingState *) ctx->output_writer_private; + + size = MAXALIGN(VARHDRSZ) + SizeOfConcurrentChange; + + if (tuple) + { + /* + * ReorderBufferCommit() stores the TOAST chunks in its private memory + * context and frees them after having called + * apply_change(). 
Therefore we need flat copy (including TOAST) that + * we eventually copy into the memory context which is available to + * decode_concurrent_changes(). + */ + if (HeapTupleHasExternal(tuple)) + { + /* + * toast_flatten_tuple_to_datum() might be more convenient but we + * don't want the decompression it does. + */ + tuple = toast_flatten_tuple(tuple, dstate->tupdesc); + flattened = true; + } + + size += tuple->t_len; + } + + /* XXX Isn't there any function / macro to do this? */ + if (size >= 0x3FFFFFFF) + elog(ERROR, "Change is too big."); + + /* Construct the change. */ + change_raw = (char *) palloc0(size); + SET_VARSIZE(change_raw, size); + + /* + * Since the varlena alignment might not be sufficient for the structure, + * set the fields in a local instance and remember where it should + * eventually be copied. Zero the instance first, so that no uninitialized + * bytes (padding, or tup_data in the TRUNCATE case) end up in the + * tuplestore. + */ + memset(&change, 0, sizeof(change)); + change.kind = kind; + dst_start = (char *) VARDATA(change_raw); + + /* No other information is needed for TRUNCATE; copied at "store". */ + if (change.kind == CHANGE_TRUNCATE) + goto store; + + /* + * Copy the tuple. + * + * CAUTION: change->tup_data.t_data must be fixed on retrieval! + */ + memcpy(&change.tup_data, tuple, sizeof(HeapTupleData)); + dst = dst_start + SizeOfConcurrentChange; + memcpy(dst, tuple->t_data, tuple->t_len); + + /* The data has been copied. */ + if (flattened) + pfree(tuple); + +store: + /* Copy the structure so it can be stored. */ + memcpy(dst_start, &change, SizeOfConcurrentChange); + + /* Store as tuple of 1 bytea column. */ + values[0] = PointerGetDatum(change_raw); + isnull[0] = false; + tuplestore_putvalues(dstate->tstore, dstate->tupdesc_change, + values, isnull); + + /* Accounting. */ + dstate->nchanges++; + + /* Cleanup.
*/ + pfree(change_raw); +} + diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 7783ba854f..11ae537a8d 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -25,6 +25,7 @@ #include "access/xlogprefetcher.h" #include "access/xlogrecovery.h" #include "commands/async.h" +#include "commands/cluster.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/autovacuum.h" @@ -148,6 +149,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, WaitEventCustomShmemSize()); size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); + size = add_size(size, ClusterShmemSize()); /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -340,6 +342,7 @@ CreateOrAttachShmemStructs(void) StatsShmemInit(); WaitEventCustomShmemInit(); InjectionPointShmemInit(); + ClusterShmemInit(); } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index f28bf37105..81f3a0a141 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1302,6 +1302,17 @@ ProcessUtilitySlow(ParseState *pstate, lockmode = AlterTableGetLockLevel(atstmt->cmds); relid = AlterTableLookupRelation(atstmt, lockmode); + /* + * If lockmode allows, check if VACUUM FULL / CLUSTER + * CONCURRENT is in progress. If lockmode is too weak, + * cluster_rel() should detect incompatible DDLs executed + * by us. + * + * XXX We might skip the changes for DDLs which do not + * change the tuple descriptor. 
+ */ + check_for_concurrent_cluster(relid, lockmode); + if (OidIsValid(relid)) { AlterTableUtilityContext atcontext; diff --git a/src/backend/utils/activity/backend_progress.c b/src/backend/utils/activity/backend_progress.c index 55c8ddd89e..bab78bd34f 100644 --- a/src/backend/utils/activity/backend_progress.c +++ b/src/backend/utils/activity/backend_progress.c @@ -162,3 +162,19 @@ pgstat_progress_end_command(void) beentry->st_progress.command_target = InvalidOid; PGSTAT_END_WRITE_ACTIVITY(beentry); } + +void +pgstat_progress_restore_state(PgBackendProgress *backup) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + + if (!beentry || !pgstat_track_activities) + return; + + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + beentry->st_progress.command = backup->command; + beentry->st_progress.command_target = backup->command_target; + memcpy(MyBEEntry->st_progress.param, backup->param, + sizeof(beentry->st_progress.param)); + PGSTAT_END_WRITE_ACTIVITY(beentry); +} diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 16144c2b72..5dc361d5d6 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -345,6 +345,7 @@ WALSummarizer "Waiting to read or update WAL summarization state." DSMRegistry "Waiting to read or update the dynamic shared memory registry." InjectionPoint "Waiting to read or update information related to injection points." SerialControl "Waiting to read or update shared pg_serial state." +ClusteredRels "Waiting to read or update information on tables being clustered concurrently." 
# # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index fc972ed17d..d652bf60cf 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -1565,6 +1565,28 @@ CacheInvalidateRelcache(Relation relation) databaseId, relationId); } +/* + * CacheInvalidateRelcacheImmediate + * Send invalidation message for the specified relation's relcache entry. + * + * Currently this is used in VACUUM FULL/CLUSTER CONCURRENTLY, to make sure + * that other backends are aware that the command is being executed for the + * relation. + */ +void +CacheInvalidateRelcacheImmediate(Relation relation) +{ + SharedInvalidationMessage msg; + + msg.rc.id = SHAREDINVALRELCACHE_ID; + msg.rc.dbId = relation->rd_rel->relisshared ? InvalidOid : MyDatabaseId; + msg.rc.relId = RelationGetRelid(relation); + /* check AddCatcacheInvalidationMessage() for an explanation */ + VALGRIND_MAKE_MEM_DEFINED(&msg, sizeof(msg)); + + SendSharedInvalidMessages(&msg, 1); +} + /* * CacheInvalidateRelcacheAll * Register invalidation of the whole relcache at the end of command. diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 422509f18d..b20a7405e6 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -64,6 +64,7 @@ #include "catalog/pg_type.h" #include "catalog/schemapg.h" #include "catalog/storage.h" +#include "commands/cluster.h" #include "commands/policy.h" #include "commands/publicationcmds.h" #include "commands/trigger.h" @@ -1258,6 +1259,10 @@ retry: /* make sure relation is marked as having no open file yet */ relation->rd_smgr = NULL; + /* Is CLUSTER CONCURRENTLY in progress?
*/ + relation->rd_cluster_concurrent = + is_concurrent_cluster_in_progress(targetRelId); + /* * now we can free the memory allocated for pg_class_tuple */ diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 4c573b2ded..d7c1ba2f5b 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -154,7 +154,6 @@ static List *exportedSnapshots = NIL; /* Prototypes for local functions */ static void UnregisterSnapshotNoOwner(Snapshot snapshot); -static void FreeSnapshot(Snapshot snapshot); static void SnapshotResetXmin(void); /* ResourceOwner callbacks to track snapshot references */ @@ -591,7 +590,7 @@ CopySnapshot(Snapshot snapshot) * FreeSnapshot * Free the memory associated with a snapshot. */ -static void +void FreeSnapshot(Snapshot snapshot) { Assert(snapshot->regd_count == 0); diff --git a/src/bin/psql/tab-complete.in.c b/src/bin/psql/tab-complete.in.c index bbd08770c3..cd3fdd3659 100644 --- a/src/bin/psql/tab-complete.in.c +++ b/src/bin/psql/tab-complete.in.c @@ -3104,7 +3104,7 @@ match_previous_words(int pattern_id, * one word, so the above test is correct. 
*/ if (ends_with(prev_wd, '(') || ends_with(prev_wd, ',')) - COMPLETE_WITH("VERBOSE"); + COMPLETE_WITH("VERBOSE", "CONCURRENTLY"); } /* COMMENT */ @@ -5103,7 +5103,8 @@ match_previous_words(int pattern_id, "DISABLE_PAGE_SKIPPING", "SKIP_LOCKED", "INDEX_CLEANUP", "PROCESS_MAIN", "PROCESS_TOAST", "TRUNCATE", "PARALLEL", "SKIP_DATABASE_STATS", - "ONLY_DATABASE_STATS", "BUFFER_USAGE_LIMIT"); + "ONLY_DATABASE_STATS", "BUFFER_USAGE_LIMIT", + "CONCURRENTLY"); else if (TailMatches("FULL|FREEZE|ANALYZE|VERBOSE|DISABLE_PAGE_SKIPPING|SKIP_LOCKED|PROCESS_MAIN|PROCESS_TOAST|TRUNCATE|SKIP_DATABASE_STATS|ONLY_DATABASE_STATS")) COMPLETE_WITH("ON", "OFF"); else if (TailMatches("INDEX_CLEANUP")) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 96cf82f97b..e4a32fc391 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -413,6 +413,10 @@ extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer TransactionId *dead_after); extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid); +extern bool HeapTupleMVCCInserted(HeapTuple htup, Snapshot snapshot, + Buffer buffer); +extern bool HeapTupleMVCCNotDeleted(HeapTuple htup, Snapshot snapshot, + Buffer buffer); extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); extern bool HeapTupleIsSurelyDead(HeapTuple htup, struct GlobalVisState *vistest); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index adb478a93c..fbc898028f 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -21,6 +21,7 @@ #include "access/sdir.h" #include "access/xact.h" #include "executor/tuptable.h" +#include "replication/logical.h" #include "storage/read_stream.h" #include "utils/rel.h" #include "utils/snapshot.h" @@ -629,6 +630,8 @@ typedef struct TableAmRoutine Relation OldIndex, bool use_sort, TransactionId OldestXmin, + Snapshot snapshot, + LogicalDecodingContext *decoding_ctx, 
TransactionId *xid_cutoff, MultiXactId *multi_cutoff, double *num_tuples, @@ -1676,6 +1679,10 @@ table_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) * not needed for the relation's AM * - *xid_cutoff - ditto * - *multi_cutoff - ditto + * - snapshot - if != NULL, ignore data changes done by transactions that this + * (MVCC) snapshot considers still in-progress or in the future. + * - decoding_ctx - logical decoding context, to capture concurrent data + * changes. * * Output parameters: * - *xid_cutoff - rel's new relfrozenxid value, may be invalid @@ -1688,6 +1695,8 @@ table_relation_copy_for_cluster(Relation OldTable, Relation NewTable, Relation OldIndex, bool use_sort, TransactionId OldestXmin, + Snapshot snapshot, + LogicalDecodingContext *decoding_ctx, TransactionId *xid_cutoff, MultiXactId *multi_cutoff, double *num_tuples, @@ -1696,6 +1705,7 @@ table_relation_copy_for_cluster(Relation OldTable, Relation NewTable, { OldTable->rd_tableam->relation_copy_for_cluster(OldTable, NewTable, OldIndex, use_sort, OldestXmin, + snapshot, decoding_ctx, xid_cutoff, multi_cutoff, num_tuples, tups_vacuumed, tups_recently_dead); diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 2dea96f47c..943fe71ba6 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -100,6 +100,9 @@ extern Oid index_concurrently_create_copy(Relation heapRelation, Oid tablespaceOid, const char *newName); +extern NullableDatum *get_index_stattargets(Oid indexid, + IndexInfo *indInfo); + extern void index_concurrently_build(Oid heapRelationId, Oid indexRelationId); diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h index 2d8e363015..c0f2cdabf0 100644 --- a/src/include/commands/cluster.h +++ b/src/include/commands/cluster.h @@ -13,10 +13,15 @@ #ifndef CLUSTER_H #define CLUSTER_H +#include "nodes/execnodes.h" #include "nodes/parsenodes.h" #include "parser/parse_node.h" +#include "replication/logical.h" #include 
"storage/lock.h" +#include "storage/relfilelocator.h" #include "utils/relcache.h" +#include "utils/resowner.h" +#include "utils/tuplestore.h" /* flag bits for ClusterParams->options */ @@ -24,6 +29,7 @@ #define CLUOPT_RECHECK 0x02 /* recheck relation state */ #define CLUOPT_RECHECK_ISCLUSTERED 0x04 /* recheck relation state for * indisclustered */ +#define CLUOPT_CONCURRENT 0x08 /* allow concurrent data changes */ /* options for CLUSTER */ typedef struct ClusterParams @@ -31,12 +37,91 @@ typedef struct ClusterParams bits32 options; /* bitmask of CLUOPT_* */ } ClusterParams; +/* + * The following definitions are used for concurrent processing. + */ + +extern RelFileLocator clustered_rel_locator; +extern RelFileLocator clustered_rel_toast_locator; + +typedef enum +{ + CHANGE_INSERT, + CHANGE_UPDATE_OLD, + CHANGE_UPDATE_NEW, + CHANGE_DELETE, + CHANGE_TRUNCATE +} ConcurrentChangeKind; + +typedef struct ConcurrentChange +{ + /* See the enum above. */ + ConcurrentChangeKind kind; + + /* + * The actual tuple. + * + * The tuple data follows the ConcurrentChange structure. Before use make + * sure the tuple is correctly aligned (ConcurrentChange can be stored as + * bytea) and that tuple->t_data is fixed. + */ + HeapTupleData tup_data; +} ConcurrentChange; + +#define SizeOfConcurrentChange (offsetof(ConcurrentChange, tup_data) + \ + sizeof(HeapTupleData)) + +/* + * Logical decoding state. + * + * Here we store the data changes that we decode from WAL while the table + * contents is being copied to a new storage. Also the necessary metadata + * needed to apply these changes to the table is stored here. + */ +typedef struct ClusterDecodingState +{ + /* The relation whose changes we're decoding. */ + Oid relid; + + /* + * Decoded changes are stored here. Although we try to avoid excessive + * batches, it can happen that the changes need to be stored to disk. The + * tuplestore does this transparently. 
+ */ + Tuplestorestate *tstore; + + /* The current number of changes in tstore. */ + double nchanges; + + /* + * Descriptor to store the ConcurrentChange structure serialized (bytea). + * We can't store the tuple directly because tuplestore only supports + * minimum tuple and we may need to transfer OID system column from the + * output plugin. Also we need to transfer the change kind, so it's better + * to put everything in the structure than to use 2 tuplestores "in + * parallel". + */ + TupleDesc tupdesc_change; + + /* Tuple descriptor needed to update indexes. */ + TupleDesc tupdesc; + + /* Slot to retrieve data from tstore. */ + TupleTableSlot *tsslot; + + ResourceOwner resowner; +} ClusterDecodingState; + extern void cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel); -extern void cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params); +extern void cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params, + bool isTopLevel, bool isVacuum); extern void check_index_is_clusterable(Relation OldHeap, Oid indexOid, LOCKMODE lockmode); extern void mark_index_clustered(Relation rel, Oid indexOid, bool is_internal); - +extern void check_relation_is_clusterable_concurrently(Relation rel, + bool is_vacuum); +extern void cluster_decode_concurrent_changes(LogicalDecodingContext *ctx, + XLogRecPtr end_of_wal); extern Oid make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod, char relpersistence, LOCKMODE lockmode); extern void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, @@ -44,8 +129,13 @@ extern void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool swap_toast_by_content, bool check_constraints, bool is_internal, + bool reindex, TransactionId frozenXid, MultiXactId cutoffMulti, char newrelpersistence); +extern Size ClusterShmemSize(void); +extern void ClusterShmemInit(void); +extern bool is_concurrent_cluster_in_progress(Oid relid); +extern void check_for_concurrent_cluster(Oid relid, LOCKMODE lockmode); #endif /* 
CLUSTER_H */ diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 5616d64523..03e3712ede 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -59,19 +59,22 @@ #define PROGRESS_CLUSTER_PHASE 1 #define PROGRESS_CLUSTER_INDEX_RELID 2 #define PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED 3 -#define PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN 4 -#define PROGRESS_CLUSTER_TOTAL_HEAP_BLKS 5 -#define PROGRESS_CLUSTER_HEAP_BLKS_SCANNED 6 -#define PROGRESS_CLUSTER_INDEX_REBUILD_COUNT 7 +#define PROGRESS_CLUSTER_HEAP_TUPLES_INSERTED 4 +#define PROGRESS_CLUSTER_HEAP_TUPLES_UPDATED 5 +#define PROGRESS_CLUSTER_HEAP_TUPLES_DELETED 6 +#define PROGRESS_CLUSTER_TOTAL_HEAP_BLKS 7 +#define PROGRESS_CLUSTER_HEAP_BLKS_SCANNED 8 +#define PROGRESS_CLUSTER_INDEX_REBUILD_COUNT 9 /* Phases of cluster (as advertised via PROGRESS_CLUSTER_PHASE) */ #define PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP 1 #define PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP 2 #define PROGRESS_CLUSTER_PHASE_SORT_TUPLES 3 #define PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP 4 -#define PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES 5 -#define PROGRESS_CLUSTER_PHASE_REBUILD_INDEX 6 -#define PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP 7 +#define PROGRESS_CLUSTER_PHASE_CATCH_UP 5 +#define PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES 6 +#define PROGRESS_CLUSTER_PHASE_REBUILD_INDEX 7 +#define PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP 8 /* Commands of PROGRESS_CLUSTER */ #define PROGRESS_CLUSTER_COMMAND_CLUSTER 1 diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 759f9a87d3..2f693e0fc0 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -181,13 +181,16 @@ typedef struct VacAttrStats #define VACOPT_ANALYZE 0x02 /* do ANALYZE */ #define VACOPT_VERBOSE 0x04 /* output INFO instrumentation messages */ #define VACOPT_FREEZE 0x08 /* FREEZE option */ -#define VACOPT_FULL 0x10 /* FULL (non-concurrent) vacuum */ -#define VACOPT_SKIP_LOCKED 0x20 /* skip if cannot get lock */ 
-#define VACOPT_PROCESS_MAIN 0x40 /* process main relation */ -#define VACOPT_PROCESS_TOAST 0x80 /* process the TOAST table, if any */ -#define VACOPT_DISABLE_PAGE_SKIPPING 0x100 /* don't skip any pages */ -#define VACOPT_SKIP_DATABASE_STATS 0x200 /* skip vac_update_datfrozenxid() */ -#define VACOPT_ONLY_DATABASE_STATS 0x400 /* only vac_update_datfrozenxid() */ +#define VACOPT_FULL_EXCLUSIVE 0x10 /* FULL (non-concurrent) vacuum */ +#define VACOPT_FULL_CONCURRENT 0x20 /* FULL (concurrent) vacuum */ +#define VACOPT_SKIP_LOCKED 0x40 /* skip if cannot get lock */ +#define VACOPT_PROCESS_MAIN 0x80 /* process main relation */ +#define VACOPT_PROCESS_TOAST 0x100 /* process the TOAST table, if any */ +#define VACOPT_DISABLE_PAGE_SKIPPING 0x200 /* don't skip any pages */ +#define VACOPT_SKIP_DATABASE_STATS 0x400 /* skip vac_update_datfrozenxid() */ +#define VACOPT_ONLY_DATABASE_STATS 0x800 /* only vac_update_datfrozenxid() */ + +#define VACOPT_FULL (VACOPT_FULL_EXCLUSIVE | VACOPT_FULL_CONCURRENT) /* * Values used by index_cleanup and truncate params. 
diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index cb2a400cdc..8b8a7d3634 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -73,6 +73,7 @@ extern void FreeSnapshotBuilder(SnapBuild *builder); extern void SnapBuildSnapDecRefcount(Snapshot snap); extern Snapshot SnapBuildInitialSnapshot(SnapBuild *builder); +extern Snapshot SnapBuildInitialSnapshotForCluster(SnapBuild *builder); extern Snapshot SnapBuildMVCCFromHistoric(Snapshot snapshot, bool in_place); extern const char *SnapBuildExportSnapshot(SnapBuild *builder); extern void SnapBuildClearExportedSnapshot(void); diff --git a/src/include/storage/lockdefs.h b/src/include/storage/lockdefs.h index 810b297edf..2a1583f367 100644 --- a/src/include/storage/lockdefs.h +++ b/src/include/storage/lockdefs.h @@ -36,8 +36,9 @@ typedef int LOCKMODE; #define AccessShareLock 1 /* SELECT */ #define RowShareLock 2 /* SELECT FOR UPDATE/FOR SHARE */ #define RowExclusiveLock 3 /* INSERT, UPDATE, DELETE */ -#define ShareUpdateExclusiveLock 4 /* VACUUM (non-FULL), ANALYZE, CREATE - * INDEX CONCURRENTLY */ +#define ShareUpdateExclusiveLock 4 /* VACUUM (non-exclusive), ANALYZE, CREATE + * INDEX CONCURRENTLY, CLUSTER + * CONCURRENTLY */ #define ShareLock 5 /* CREATE INDEX (WITHOUT CONCURRENTLY) */ #define ShareRowExclusiveLock 6 /* like EXCLUSIVE MODE, but allows ROW * SHARE */ diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index 6a2f64c54f..b24c003c53 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -83,3 +83,4 @@ PG_LWLOCK(49, WALSummarizer) PG_LWLOCK(50, DSMRegistry) PG_LWLOCK(51, InjectionPoint) PG_LWLOCK(52, SerialControl) +PG_LWLOCK(54, ClusteredRels) diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h index e09598eafc..5ab5df9d41 100644 --- a/src/include/utils/backend_progress.h +++ b/src/include/utils/backend_progress.h @@ -35,7 +35,7 @@ 
typedef enum ProgressCommandType /* * Any command which wishes can advertise that it is running by setting - * command, command_target, and param[]. command_target should be the OID of + * command, command_target, and param[]. command_target should be the OID of * the relation which the command targets (we assume there's just one, as this * is meant for utility commands), but the meaning of each element in the * param array is command-specific. @@ -55,6 +55,7 @@ extern void pgstat_progress_parallel_incr_param(int index, int64 incr); extern void pgstat_progress_update_multi_param(int nparam, const int *index, const int64 *val); extern void pgstat_progress_end_command(void); +extern void pgstat_progress_restore_state(PgBackendProgress *backup); #endif /* BACKEND_PROGRESS_H */ diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index 299cd7585f..6c15b035f9 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -49,6 +49,8 @@ extern void CacheInvalidateCatalog(Oid catalogId); extern void CacheInvalidateRelcache(Relation relation); +extern void CacheInvalidateRelcacheImmediate(Relation relation); + extern void CacheInvalidateRelcacheAll(void); extern void CacheInvalidateRelcacheByTuple(HeapTuple classTuple); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 8700204953..adda46c985 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -253,6 +253,9 @@ typedef struct RelationData bool pgstat_enabled; /* should relation stats be counted */ /* use "struct" here to avoid needing to include pgstat.h: */ struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ + + /* Is CLUSTER CONCURRENTLY being performed on this relation?
*/ + bool rd_cluster_concurrent; } RelationData; @@ -684,7 +687,9 @@ RelationCloseSmgr(Relation relation) #define RelationIsAccessibleInLogicalDecoding(relation) \ (XLogLogicalInfoActive() && \ RelationNeedsWAL(relation) && \ - (IsCatalogRelation(relation) || RelationIsUsedAsCatalogTable(relation))) + (IsCatalogRelation(relation) || \ + RelationIsUsedAsCatalogTable(relation) || \ + (relation)->rd_cluster_concurrent)) /* * RelationIsLogicallyLogged diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 874c59b60d..91c70621ec 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -62,6 +62,8 @@ extern Snapshot GetLatestSnapshot(void); extern void SnapshotSetCommandId(CommandId curcid); extern Snapshot CopySnapshot(Snapshot snapshot); +extern void FreeSnapshot(Snapshot snapshot); + extern Snapshot GetCatalogSnapshot(Oid relid); extern Snapshot GetNonHistoricCatalogSnapshot(Oid relid); extern void InvalidateCatalogSnapshot(void); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 3014d047fe..81300642a5 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1962,17 +1962,20 @@ pg_stat_progress_cluster| SELECT s.pid, WHEN 2 THEN 'index scanning heap'::text WHEN 3 THEN 'sorting tuples'::text WHEN 4 THEN 'writing new heap'::text - WHEN 5 THEN 'swapping relation files'::text - WHEN 6 THEN 'rebuilding index'::text - WHEN 7 THEN 'performing final cleanup'::text + WHEN 5 THEN 'catch-up'::text + WHEN 6 THEN 'swapping relation files'::text + WHEN 7 THEN 'rebuilding index'::text + WHEN 8 THEN 'performing final cleanup'::text ELSE NULL::text END AS phase, (s.param3)::oid AS cluster_index_relid, s.param4 AS heap_tuples_scanned, - s.param5 AS heap_tuples_written, - s.param6 AS heap_blks_total, - s.param7 AS heap_blks_scanned, - s.param8 AS index_rebuild_count + s.param5 AS heap_tuples_inserted, + s.param6 AS heap_tuples_updated, + s.param7 AS 
heap_tuples_deleted, + s.param8 AS heap_blks_total, + s.param9 AS heap_blks_scanned, + s.param10 AS index_rebuild_count FROM (pg_stat_get_progress_info('CLUSTER'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) LEFT JOIN pg_database d ON ((s.datid = d.oid))); pg_stat_progress_copy| SELECT s.pid, -- 2.45.2