From 78ea0efde8799445b90a70ca321e40b75fea52c9 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Thu, 20 Feb 2025 21:12:26 +0100 Subject: [PATCH v2 5/6] Allow to resize shared memory without restart Add an assign hook for shared_buffers to resize shared memory using the space introduced in the previous commits, without requiring a PostgreSQL restart. Essentially the implementation is based on two mechanisms: a global Barrier to coordinate backends that simultaneously change shared_buffers, and pieces in shared memory to coordinate backends that are too late to the party for some reason. The resize process looks like this: * The GUC assign hook sets a flag to let the Postmaster know that a resize was requested. * Postmaster checks the flag in the event loop, and starts the resize by emitting a ProcSignal barrier. Afterwards it does the shared memory resize itself. * All the backends that participate in the ProcSignal mechanism recalculate the shared memory size based on the new NBuffers and extend it using mremap. * When finished, a backend waits on a global ShmemControl barrier until all backends have finished as well. This way we ensure three stages with clear boundaries: before the resize, when all processes use the old NBuffers; during the resize, when processes have a mix of old and new NBuffers, and wait until it's done; after the resize, when all processes use the new NBuffers. * After all backends are using the new value, one backend will initialize new shared structures (buffer blocks, descriptors, etc) as needed and broadcast the new value of NBuffers via ShmemControl in shared memory. Other backends wait for this operation to finish as well. Then the barrier is lifted and everything goes on as usual. 
Here is what it looks like after raising shared_buffers from 128 MB to 512 MB and calling pg_reload_conf(): -- 128 MB 7f5a2bd04000-7f5a32e52000 /dev/zero (deleted) 7f5a39252000-7f5a4030e000 /dev/zero (deleted) 7f5a4670e000-7f5a4d7ba000 /dev/zero (deleted) 7f5a53bba000-7f5a5ad26000 /dev/zero (deleted) 7f5a9ad26000-7f5aa9d94000 /dev/zero (deleted) ^ buffers mapping, ~240 MB 7f5d29d94000-7f5d30e00000 /dev/zero (deleted) -- 512 MB 7f5a2bd04000-7f5a33274000 /dev/zero (deleted) 7f5a39252000-7f5a4057e000 /dev/zero (deleted) 7f5a4670e000-7f5a4d9fa000 /dev/zero (deleted) 7f5a53bba000-7f5a5b1a6000 /dev/zero (deleted) 7f5a9ad26000-7f5ac1f14000 /dev/zero (deleted) ^ buffers mapping, ~625 MB 7f5d29d94000-7f5d30f80000 /dev/zero (deleted) The implementation supports only increasing shared_buffers. For decreasing the value a similar procedure is needed, but the buffer blocks with data have to be drained first, so that the actual data set fits into the new, smaller space. Experiments show that shared mappings have to be extended separately in each process that uses them. Another rough edge is that a backend blocked on ReadCommand will not apply the shared_buffers change until it reads something. Note that mremap is Linux-specific, thus the implementation is not very portable. 
Authors: Dmitrii Dolgov, Ashutosh Bapat --- src/backend/port/sysv_shmem.c | 300 ++++++++++++++++++ src/backend/postmaster/postmaster.c | 15 + src/backend/storage/buffer/buf_init.c | 152 ++++++++- src/backend/storage/ipc/ipci.c | 11 + src/backend/storage/ipc/procsignal.c | 45 +++ src/backend/storage/ipc/shmem.c | 14 +- src/backend/tcop/postgres.c | 15 + .../utils/activity/wait_event_names.txt | 3 + src/backend/utils/misc/guc_tables.c | 4 +- src/include/storage/bufmgr.h | 1 + src/include/storage/ipc.h | 2 + src/include/storage/lwlocklist.h | 1 + src/include/storage/pg_shmem.h | 24 ++ src/include/storage/procsignal.h | 1 + src/tools/pgindent/typedefs.list | 1 + 15 files changed, 577 insertions(+), 12 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 59aa67cb135..35a8ff92175 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -30,13 +30,17 @@ #include "miscadmin.h" #include "port/pg_bitutils.h" #include "portability/mem.h" +#include "storage/bufmgr.h" #include "storage/dsm.h" #include "storage/fd.h" #include "storage/ipc.h" +#include "storage/lwlock.h" #include "storage/pg_shmem.h" +#include "storage/procsignal.h" #include "utils/guc.h" #include "utils/guc_hooks.h" #include "utils/pidfile.h" +#include "utils/wait_event.h" /* @@ -105,6 +109,13 @@ typedef struct AnonymousMapping static AnonymousMapping Mappings[ANON_MAPPINGS]; +/* Flag telling postmaster that resize is needed */ +volatile bool pending_pm_shmem_resize = false; + +/* Keeps track of the previous NBuffers value */ +static int NBuffersOld = -1; +static int NBuffersPending = -1; + /* Keeps track of used mapping segments */ static int next_free_segment = 0; @@ -859,6 +870,274 @@ AnonymousShmemDetach(int status, Datum arg) } } +/* + * Resize all shared memory segments based on the current NBuffers value, which + * is is applied from NBuffersPending. 
The actual segment resizing is done via + * mremap, which will fail if is not sufficient space to expand the mapping. + * When finished, based on the new and old values initialize new buffer blocks + * if any. + * + * If reinitializing took place, as the last step this function broadcasts + * NSharedBuffers to it's new value, allowing any other backends to rely on + * this new value and skip buffers reinitialization. + */ +static bool +AnonymousShmemResize(void) +{ + int numSemas; + bool reinit = false; + NBuffers = NBuffersPending; + + elog(DEBUG1, "Resize shmem from %d to %d", NBuffersOld, NBuffers); + + /* + * XXX: Where to reset the flag is still an open question. E.g. do we + * consider a no-op when NBuffers is equal to NBuffersOld a genuine resize + * and reset the flag? + */ + pending_pm_shmem_resize = false; + + /* + * XXX: Currently only increasing of shared_buffers is supported. For + * decreasing something similar has to be done, but buffer blocks with + * data have to be drained first. + */ + if(NBuffersOld > NBuffers) + return false; + + for(int i = 0; i < next_free_segment; i++) + { + /* Note that CalculateShmemSize indirectly depends on NBuffers */ + Size new_size = CalculateShmemSize(&numSemas, i); + AnonymousMapping *m = &Mappings[i]; + + if (m->shmem == NULL) + continue; + + if (m->shmem_size == new_size) + continue; + + + /* + * Fail hard if faced any issues. In theory we could try to handle this + * more gracefully and proceed with shared memory as before, but some + * other backends might have succeeded and have different size. If we + * would like to go this way, to be consistent we would need to + * synchronize again, and it's not clear if it's worth the effort. 
+ */ + if (mremap(m->shmem, m->shmem_size, new_size, 0) < 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("could not resize shared memory %p to %d (%zu): %m", + m->shmem, NBuffers, m->shmem_size))); + else + { + reinit = true; + m->shmem_size = new_size; + } + } + + if (reinit) + { + if(IsUnderPostmaster && + LWLockConditionalAcquire(ShmemResizeLock, LW_EXCLUSIVE)) + { + /* + * If the new NBuffers was already broadcasted, the buffer pool was + * already initialized before. + * + * Since we're not on a hot path, we use lwlocks and do not need to + * involve memory barrier. + */ + if(pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers) + { + /* + * Allow the first backend that managed to get the lock to + * reinitialize the new portion of buffer pool. Every other + * process will wait on the shared barrier for that to finish, + * since it's a part of the SHMEM_RESIZE_DONE phase. + * + * XXX: This is the right place for buffer eviction as well. + */ + ResizeBufferPool(NBuffersOld, true); + + /* If all fine, broadcast the new value */ + pg_atomic_write_u32(&ShmemCtrl->NSharedBuffers, NBuffers); + } + else + ResizeBufferPool(NBuffersOld, false); + + LWLockRelease(ShmemResizeLock); + } + } + + return true; +} + +/* + * We are asked to resize shared memory. Do the resize and make sure to wait on + * the provided barrier until all simultaneously participating backends finish + * resizing as well, otherwise we face danger of inconsistency between + * backends. + * + * XXX: If a backend is blocked on ReadCommand in PostgresMain, it will not + * proceed with AnonymousShmemResize after receiving SIGHUP, until something + * will be sent. 
+ */ +bool +ProcessBarrierShmemResize(Barrier *barrier) +{ + elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d", + NBuffersOld, NBuffersPending, pending_pm_shmem_resize); + + /* Wait until we have seen the new NBuffers value */ + if (!pending_pm_shmem_resize) + return false; + + /* + * After attaching to the barrier we could be in any of states: + * + * - Initial SHMEM_RESIZE_REQUESTED, nothing has been done yet + * - SHMEM_RESIZE_START, some of the backends have started to resize + * - SHMEM_RESIZE_DONE, participating backends have finished resizing + * - SHMEM_RESIZE_REQUESTED after the reset, the shared memory was already + * resized + * + * The first three states take place while the actual resize is in + * progress, and all we need to do is join and proceed with resizing. This + * way all simultaneously participating backends will remap and wait until + * one of them initialize new buffers. + * + * The last state happens when we are too late and everything is already + * done. In that case proceed as well, relying on AnonymousShmemResize not + * reinitialize anything since the NSharedBuffers is already broadcasted. + */ + BarrierAttach(barrier); + + /* First phase means the resize has begun, SHMEM_RESIZE_START */ + BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START); + + /* XXX: Split mremap and buffer reinitialization into two barrier phases */ + AnonymousShmemResize(); + + /* The second phase means the resize has finished, SHMEM_RESIZE_DONE */ + BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_DONE); + + /* Allow the last backend to reset the barrier */ + if (BarrierArriveAndDetach(barrier)) + ResetShmemBarrier(); + + return true; +} + +/* + * GUC assign hook for shared_buffers. It's recommended for an assign hook to + * be as minimal as possible, thus we just request shared memory resize and + * remember the previous value. 
+ */ +void +assign_shared_buffers(int newval, void *extra, bool *pending) +{ + elog(DEBUG1, "Received SIGHUP for shmem resizing"); + + /* Request shared memory resize only when it was initialized */ + if (next_free_segment != 0) + { + elog(DEBUG1, "Set pending signal"); + pending_pm_shmem_resize = true; + *pending = true; + NBuffersPending = newval; + } + + NBuffersOld = NBuffers; +} + +/* + * Test if we have somehow missed a shmem resize signal and NBuffers value + * differs from NSharedBuffers. If yes, catchup and do resize. + */ +void +AdjustShmemSize(void) +{ + uint32 NSharedBuffers = pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers); + + if (NSharedBuffers != NBuffers) + { + /* + * If the broadcasted shared_buffers is different from the one we see, + * it could be that the backend has missed a resize signal. To avoid + * any inconsistency, adjust the shared mappings, before having a + * chance to access the buffer pool. + */ + ereport(LOG, + (errmsg("shared_buffers has been changed from %d to %d, " + "resize shared memory", + NBuffers, NSharedBuffers))); + NBuffers = NSharedBuffers; + AnonymousShmemResize(); + } +} + +/* + * Coordinate all existing processes to make sure they all will have consistent + * view of shared memory size. Must be called only in postmaster. + */ +void +CoordinateShmemResize(void) +{ + elog(DEBUG1, "Coordinating shmem resize from %d to %d", + NBuffersOld, NBuffers); + Assert(!IsUnderPostmaster); + + /* + * If the value did not change, or shared memory segments are not + * initialized yet, skip the resize. + */ + if (NBuffersPending == NBuffersOld || next_free_segment == 0) + { + elog(DEBUG1, "Skip resizing, new %d, old %d, free segment %d", + NBuffers, NBuffersOld, next_free_segment); + return; + } + + /* + * Shared memory resize requires some coordination done by postmaster, + * and consists of three phases: + * + * - Before the resize all existing backends have the same old NBuffers. 
+ * - When resize is in progress, backends are expected to have a + * mixture of old a new values. They're not allowed to touch buffer + * pool during this time frame. + * - After resize has been finished, all existing backends, that can access + * the buffer pool, are expected to have the same new value of NBuffers. + * There might still be some backends, that are sleeping or for some + * other reason not doing any work yet and have old NBuffers -- but as + * soon as they will get some time slice, they will acquire the new + * value. + */ + elog(DEBUG1, "Emit a barrier for shmem resizing"); + EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SHMEM_RESIZE); + + AnonymousShmemResize(); + + /* + * Normally we would call WaitForProcSignalBarrier here to wait until every + * backend has reported on the ProcSignalBarrier. But for shared memory + * resize we don't need this, as every participating backend will + * synchronize on the ProcSignal barrier, and there is no sequential logic + * we have to perform afterwards. In fact even if we would like to wait + * here, it wouldn't be possible -- we're in the postmaster, without any + * waiting infrastructure available. + * + * If at some point it will turn out that waiting is essential, we would + * need to consider some alternatives. E.g. it could be a designated + * coordination process, which is not a postmaster. Another option would be + * to introduce a CoordinateShmemResize lock and allow only one process to + * take it (this probably would have to be something different than + * LWLocks, since they block interrupts, and coordination relies on them). 
+ */ +} + /* * PGSharedMemoryCreate * @@ -1174,3 +1453,24 @@ PGSharedMemoryDetach(void) } } } + +void +WaitOnShmemBarrier(int phase) +{ + Barrier *barrier = &ShmemCtrl->Barrier; + + if (BarrierPhase(barrier) == phase) + { + ereport(LOG, + (errmsg("ProcSignal barrier is in phase %d, waiting", phase))); + BarrierAttach(barrier); + BarrierArriveAndWait(barrier, 0); + BarrierDetach(barrier); + } +} + +void +ResetShmemBarrier(void) +{ + BarrierInit(&ShmemCtrl->Barrier, 0); +} diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index bb22b13adef..f3e508141b2 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -418,6 +418,7 @@ static void process_pm_pmsignal(void); static void process_pm_child_exit(void); static void process_pm_reload_request(void); static void process_pm_shutdown_request(void); +static void process_pm_shmem_resize(void); static void dummy_handler(SIGNAL_ARGS); static void CleanupBackend(PMChild *bp, int exitstatus); static void HandleChildCrash(int pid, int exitstatus, const char *procname); @@ -1680,6 +1681,9 @@ ServerLoop(void) if (pending_pm_pmsignal) process_pm_pmsignal(); + if (pending_pm_shmem_resize) + process_pm_shmem_resize(); + if (events[i].events & WL_SOCKET_ACCEPT) { ClientSocket s; @@ -2026,6 +2030,17 @@ process_pm_reload_request(void) } } +static void +process_pm_shmem_resize(void) +{ + /* + * Failure to resize is considered to be fatal and will not be + * retried, which means we can disable pending flag right here. + */ + pending_pm_shmem_resize = false; + CoordinateShmemResize(); +} + /* * pg_ctl uses SIGTERM, SIGINT and SIGQUIT to request different types of * shutdown. 
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index f5b9290a640..b7de0ab6b0d 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -23,6 +23,41 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; +/* + * Currently broadcasted value of NBuffers in shared memory. + * + * Most of the time this value is going to be equal to NBuffers. But if + * postmaster is resizing shared memory and a new backend was created + * at the same time, there is a possibility for the new backend to inherit the + * old NBuffers value, but miss the resize signal if ProcSignal infrastructure + * was not initialized yet. Consider this situation: + * + * Postmaster ------> New Backend + * | | + * | Launch + * | | + * | Inherit NBuffers + * | | + * Resize NBuffers | + * | | + * Emit Barrier | + * | Init ProcSignal + * | | + * Finish resize | + * | | + * New NBuffers Old NBuffers + * + * In this case the backend is not yet ready to receive a signal from + * EmitProcSignalBarrier, and will be ignored. The same happens if ProcSignal + * is initialized even later, after the resizing was finished. + * + * To address resulting inconsistency, postmaster broadcasts the current + * NBuffers value via shared memory. Every new backend has to verify this value + * before it will access the buffer pool: if it differs from its own value, + * this indicates a shared memory resize has happened and the backend has to + * first synchronize with rest of the pack. 
+ */ +ShmemControl *ShmemCtrl = NULL; /* * Data Structures: @@ -72,7 +107,19 @@ BufferManagerShmemInit(void) bool foundBufs, foundDescs, foundIOCV, - foundBufCkpt; + foundBufCkpt, + foundShmemCtrl; + + ShmemCtrl = (ShmemControl *) + ShmemInitStruct("Shmem Control", sizeof(ShmemControl), + &foundShmemCtrl); + + if (!foundShmemCtrl) + { + /* Initialize with the currently known value */ + pg_atomic_init_u32(&ShmemCtrl->NSharedBuffers, NBuffers); + BarrierInit(&ShmemCtrl->Barrier, 0); + } /* Align descriptors to a cacheline boundary. */ BufferDescriptors = (BufferDescPadded *) @@ -153,6 +200,109 @@ BufferManagerShmemInit(void) &backend_flush_after); } +/* + * Reinitialize shared memory structures, which size depends on NBuffers. It's + * similar to InitBufferPool, but applied only to the buffers in the range + * between NBuffersOld and NBuffers. + * + * NBuffersOld tells what was the original value of NBuffersOld. It will be + * used to identify new and not yet initialized buffers. + * + * initNew flag indicates that the caller wants new buffers to be initialized. + * No locks are taking in this function, it is the caller responsibility to + * make sure only one backend can work with new buffers. + */ +void +ResizeBufferPool(int NBuffersOld, bool initNew) +{ + bool foundBufs, + foundDescs, + foundIOCV, + foundBufCkpt; + int i; + elog(DEBUG1, "Resizing buffer pool from %d to %d", NBuffersOld, NBuffers); + + /* XXX: Only increasing of shared_buffers is supported in this function */ + if(NBuffersOld > NBuffers) + return; + + /* Align descriptors to a cacheline boundary. */ + BufferDescriptors = (BufferDescPadded *) + ShmemInitStructInSegment("Buffer Descriptors", + NBuffers * sizeof(BufferDescPadded), + &foundDescs, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); + + /* Align condition variables to cacheline boundary. 
*/ + BufferIOCVArray = (ConditionVariableMinimallyPadded *) + ShmemInitStructInSegment("Buffer IO Condition Variables", + NBuffers * sizeof(ConditionVariableMinimallyPadded), + &foundIOCV, BUFFER_IOCV_SHMEM_SEGMENT); + + /* + * The array used to sort to-be-checkpointed buffer ids is located in + * shared memory, to avoid having to allocate significant amounts of + * memory at runtime. As that'd be in the middle of a checkpoint, or when + * the checkpointer is restarted, memory allocation failures would be + * painful. + */ + CkptBufferIds = (CkptSortItem *) + ShmemInitStructInSegment("Checkpoint BufferIds", + NBuffers * sizeof(CkptSortItem), &foundBufCkpt, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); + + /* Align buffer pool on IO page size boundary. */ + BufferBlocks = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemInitStructInSegment("Buffer Blocks", + NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &foundBufs, BUFFERS_SHMEM_SEGMENT)); + + /* + * It's enough to only resize shmem structures, if some other backend will + * do initialization of new buffers for us. + */ + if (!initNew) + return; + + elog(DEBUG1, "Initialize new buffers"); + + /* + * Initialize the headers for new buffers. + */ + for (i = NBuffersOld; i < NBuffers; i++) + { + BufferDesc *buf = GetBufferDescriptor(i); + + ClearBufferTag(&buf->tag); + + pg_atomic_init_u32(&buf->state, 0); + buf->wait_backend_pgprocno = INVALID_PROC_NUMBER; + + buf->buf_id = i; + + /* + * Initially link all the buffers together as unused. Subsequent + * management of this list is done by freelist.c. 
+ */ + buf->freeNext = i + 1; + + LWLockInitialize(BufferDescriptorGetContentLock(buf), + LWTRANCHE_BUFFER_CONTENT); + + ConditionVariableInit(BufferDescriptorGetIOCV(buf)); + } + + /* Correct last entry of linked list */ + GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; + + /* Init other shared buffer-management stuff */ + StrategyInitialize(!foundDescs); + + /* Initialize per-backend file flush context */ + WritebackContextInit(&BackendWritebackContext, + &backend_flush_after); +} + /* * BufferManagerShmemSize * diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 68778522591..a2c635f288e 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -83,6 +83,9 @@ RequestAddinShmemSpace(Size size) * * If num_semaphores is not NULL, it will be set to the number of semaphores * required. + * + * XXX: Calculation for non main shared memory segments are incorrect, it + * includes more than needed for buffers only. */ Size CalculateShmemSize(int *num_semaphores, int shmem_segment) @@ -149,6 +152,14 @@ CalculateShmemSize(int *num_semaphores, int shmem_segment) size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); + /* + * XXX: For some reason slightly more memory is needed for larger + * shared_buffers, but this size is enough for any large value I've tested + * with. Is it a mistake in how slots are split, or there was a hidden + * inconsistency in shmem calculation? 
+ */ + size = add_size(size, 1024 * 1024 * 100); + /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 7401b6e625e..bec0e00f901 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -27,6 +27,7 @@ #include "storage/condition_variable.h" #include "storage/ipc.h" #include "storage/latch.h" +#include "storage/pg_shmem.h" #include "storage/shmem.h" #include "storage/sinval.h" #include "storage/smgr.h" @@ -108,6 +109,10 @@ static bool CheckProcSignal(ProcSignalReason reason); static void CleanupProcSignalState(int status, Datum arg); static void ResetProcSignalBarrierBits(uint32 flags); +#ifdef DEBUG_SHMEM_RESIZE +bool delay_proc_signal_init = false; +#endif + /* * ProcSignalShmemSize * Compute space needed for ProcSignal's shared memory @@ -168,6 +173,42 @@ ProcSignalInit(bool cancel_key_valid, int32 cancel_key) ProcSignalSlot *slot; uint64 barrier_generation; +#ifdef DEBUG_SHMEM_RESIZE + /* + * Introduced for debugging purposes. You can change the variable at + * runtime using gdb, then start new backends with delayed ProcSignal + * initialization. Simple pg_usleep wont work here due to SIGHUP interrupt + * needed for testing. 
Taken from pg_sleep; + */ + if (delay_proc_signal_init) + { +#define GetNowFloat() ((float8) GetCurrentTimestamp() / 1000000.0) + float8 endtime = GetNowFloat() + 5; + + for (;;) + { + float8 delay; + long delay_ms; + + CHECK_FOR_INTERRUPTS(); + + delay = endtime - GetNowFloat(); + if (delay >= 600.0) + delay_ms = 600000; + else if (delay > 0.0) + delay_ms = (long) (delay * 1000.0); + else + break; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + delay_ms, + WAIT_EVENT_PG_SLEEP); + ResetLatch(MyLatch); + } + } +#endif + if (MyProcNumber < 0) elog(ERROR, "MyProcNumber not set"); if (MyProcNumber >= NumProcSignalSlots) @@ -573,6 +614,10 @@ ProcessProcSignalBarrier(void) case PROCSIGNAL_BARRIER_SMGRRELEASE: processed = ProcessBarrierSmgrRelease(); break; + case PROCSIGNAL_BARRIER_SHMEM_RESIZE: + processed = ProcessBarrierShmemResize( + &ShmemCtrl->Barrier); + break; } /* diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 389abc82519..226b38ba979 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -493,17 +493,13 @@ ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr, { /* * Structure is in the shmem index so someone else has allocated it - * already. The size better be the same as the size we are trying to - * initialize to, or there is a name conflict (or worse). + * already. Verify the structure's size: + * - If it's the same, we've found the expected structure. + * - If it's different, we're resizing the expected structure. 
*/ if (result->size != size) - { - LWLockRelease(ShmemIndexLock); - ereport(ERROR, - (errmsg("ShmemIndex entry size is wrong for data structure" - " \"%s\": expected %zu, actual %zu", - name, size, result->size))); - } + result->size = size; + structPtr = result->location; } else diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 13fb8c31702..04cdd0d24d8 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -62,6 +62,7 @@ #include "rewrite/rewriteHandler.h" #include "storage/bufmgr.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procsignal.h" @@ -4267,6 +4268,20 @@ PostgresMain(const char *dbname, const char *username) */ BeginReportingGUCOptions(); + /* + * Verify the shared barrier, if it's still active: join and wait. + * + * XXX: Any potential race condition if not a single backend has + * incremented the barrier phase? + */ + WaitOnShmemBarrier(SHMEM_RESIZE_START); + + /* + * After waiting on the barrier above we guaranteed to have NSharedBuffers + * broadcasted, so we can use it in the function below. + */ + AdjustShmemSize(); + /* * Also set up handler to log session end; we have to wait till now to be * sure Log_disconnections has its final value. diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index e199f071628..012acb98169 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -154,6 +154,8 @@ REPLICATION_ORIGIN_DROP "Waiting for a replication origin to become inactive so REPLICATION_SLOT_DROP "Waiting for a replication slot to become inactive so it can be dropped." RESTORE_COMMAND "Waiting for to complete." SAFE_SNAPSHOT "Waiting to obtain a valid snapshot for a READ ONLY DEFERRABLE transaction." +SHMEM_RESIZE_START "Waiting for other backends to start resizing shared memory." 
+SHMEM_RESIZE_DONE "Waiting for other backends to finish resizing shared memory." SYNC_REP "Waiting for confirmation from a remote server during synchronous replication." WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit." WAL_RECEIVER_WAIT_START "Waiting for startup process to send initial data for streaming replication." @@ -346,6 +348,7 @@ WALSummarizer "Waiting to read or update WAL summarization state." DSMRegistry "Waiting to read or update the dynamic shared memory registry." InjectionPoint "Waiting to read or update information related to injection points." SerialControl "Waiting to read or update shared pg_serial state." +ShmemResize "Waiting to resize shared memory." # # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 3cde94a1759..efdaa71c8fb 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2339,14 +2339,14 @@ struct config_int ConfigureNamesInt[] = * checking for overflow, so we mustn't allow more than INT_MAX / 2. 
*/ { - {"shared_buffers", PGC_POSTMASTER, RESOURCES_MEM, + {"shared_buffers", PGC_SIGHUP, RESOURCES_MEM, gettext_noop("Sets the number of shared memory buffers used by the server."), NULL, GUC_UNIT_BLOCKS }, &NBuffers, 16384, 16, INT_MAX / 2, - NULL, NULL, NULL + NULL, assign_shared_buffers, NULL }, { diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index bb7fe02e243..fff80214822 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -298,6 +298,7 @@ extern bool EvictUnpinnedBuffer(Buffer buf); /* in buf_init.c */ extern void BufferManagerShmemInit(void); extern Size BufferManagerShmemSize(int); +extern void ResizeBufferPool(int, bool); /* in localbuf.c */ extern void AtProcExit_LocalBuffers(void); diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h index c0439f2206b..5f5b45c88bd 100644 --- a/src/include/storage/ipc.h +++ b/src/include/storage/ipc.h @@ -64,6 +64,7 @@ typedef void (*shmem_startup_hook_type) (void); /* ipc.c */ extern PGDLLIMPORT bool proc_exit_inprogress; extern PGDLLIMPORT bool shmem_exit_inprogress; +extern PGDLLIMPORT volatile bool pending_pm_shmem_resize; extern void proc_exit(int code) pg_attribute_noreturn(); extern void shmem_exit(int code); @@ -83,5 +84,6 @@ extern void CreateSharedMemoryAndSemaphores(void); extern void AttachSharedMemoryStructs(void); #endif extern void InitializeShmemGUCs(void); +extern void CoordinateShmemResize(void); #endif /* IPC_H */ diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index cf565452382..61e89c6e8fd 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -83,3 +83,4 @@ PG_LWLOCK(49, WALSummarizer) PG_LWLOCK(50, DSMRegistry) PG_LWLOCK(51, InjectionPoint) PG_LWLOCK(52, SerialControl) +PG_LWLOCK(53, ShmemResize) diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index ba0192baf95..b597df0d3a3 100644 --- a/src/include/storage/pg_shmem.h +++ 
b/src/include/storage/pg_shmem.h @@ -24,6 +24,7 @@ #ifndef PG_SHMEM_H #define PG_SHMEM_H +#include "storage/barrier.h" #include "storage/dsm_impl.h" #include "storage/spin.h" @@ -56,6 +57,23 @@ typedef struct ShmemSegment extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS]; +/* + * ShmemControl is shared between backends and helps to coordinate shared + * memory resize. + */ +typedef struct +{ + pg_atomic_uint32 NSharedBuffers; + Barrier Barrier; +} ShmemControl; + +extern PGDLLIMPORT ShmemControl *ShmemCtrl; + +/* The phases for shared memory resizing, used by for ProcSignal barrier. */ +#define SHMEM_RESIZE_REQUESTED 0 +#define SHMEM_RESIZE_START 1 +#define SHMEM_RESIZE_DONE 2 + /* GUC variables */ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; @@ -105,6 +123,12 @@ extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2); extern void PGSharedMemoryDetach(void); extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags); +bool ProcessBarrierShmemResize(Barrier *barrier); +void assign_shared_buffers(int newval, void *extra, bool *pending); +void AdjustShmemSize(void); +extern void WaitOnShmemBarrier(int phase); +extern void ResetShmemBarrier(void); + /* * To be able to dynamically resize largest parts of the data stored in shared * memory, we split it into multiple shared memory mappings segments. 
Each diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 022fd8ed933..4c9973dc2d9 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -54,6 +54,7 @@ typedef enum typedef enum { PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */ + PROCSIGNAL_BARRIER_SHMEM_RESIZE, /* ask backends to resize shared memory */ } ProcSignalBarrierType; /* diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index fb39c915d76..5bf6d099808 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2671,6 +2671,7 @@ ShellTypeInfo ShippableCacheEntry ShippableCacheKey ShmemIndexEnt +ShmemControl ShutdownForeignScan_function ShutdownInformation ShutdownMode -- 2.45.1