From 7183999bba1cbeebd059d18e5a590cbef7aff2d1 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Wed, 16 Oct 2024 20:24:58 +0200
Subject: [PATCH v1 4/5] Allow resizing shared memory without restart

Add an assign hook for shared_buffers to resize shared memory using the
space introduced in the previous commits, without requiring a PostgreSQL
restart. The size of every shared memory slot is recalculated based on
the new NBuffers, and the mapping is extended using mremap. After the
new space is mapped, the new shared structures (buffer blocks,
descriptors, etc.) are allocated as needed.

Here is how it looks after raising shared_buffers from 128 MB to 512 MB
and calling pg_reload_conf():

-- 128 MB
7f5a2bd04000-7f5a32e52000 /dev/zero (deleted)
7f5a39252000-7f5a4030e000 /dev/zero (deleted)
7f5a4670e000-7f5a4d7ba000 /dev/zero (deleted)
7f5a53bba000-7f5a5ad26000 /dev/zero (deleted)
7f5a9ad26000-7f5aa9d94000 /dev/zero (deleted)
^ buffers mapping, ~240 MB
7f5d29d94000-7f5d30e00000 /dev/zero (deleted)

-- 512 MB
7f5a2bd04000-7f5a33274000 /dev/zero (deleted)
7f5a39252000-7f5a4057e000 /dev/zero (deleted)
7f5a4670e000-7f5a4d9fa000 /dev/zero (deleted)
7f5a53bba000-7f5a5b1a6000 /dev/zero (deleted)
7f5a9ad26000-7f5ac1f14000 /dev/zero (deleted)
^ buffers mapping, ~625 MB
7f5d29d94000-7f5d30f80000 /dev/zero (deleted)

The implementation supports only increasing shared_buffers. Decreasing
the value requires a similar procedure, but the buffer blocks holding
data have to be drained first, so that the actual data set fits into the
new, smaller space.

Experiments show that the shared mappings have to be extended separately
in each process that uses them. Another rough edge is that a backend
executing pg_reload_conf interactively does not resize its mappings
immediately; for some reason it requires another command.

Note that mremap is Linux-specific, so the implementation is not very
portable.
---
 src/backend/port/sysv_shmem.c                 | 62 +++++++++++++
 src/backend/storage/buffer/buf_init.c         | 86 +++++++++++++++++++
 src/backend/storage/ipc/ipci.c                | 11 +++
 src/backend/storage/ipc/shmem.c               | 14 ++-
 .../utils/activity/wait_event_names.txt       |  1 +
 src/backend/utils/misc/guc_tables.c           |  4 +-
 src/include/storage/bufmgr.h                  |  1 +
 src/include/storage/lwlocklist.h              |  1 +
 src/include/storage/pg_shmem.h                |  2 +
 9 files changed, 171 insertions(+), 11 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index beebd4d85e..4bdadbb0e2 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -30,9 +30,11 @@
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
 #include "portability/mem.h"
+#include "storage/bufmgr.h"
 #include "storage/dsm.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
 #include "utils/guc.h"
 #include "utils/guc_hooks.h"
@@ -859,6 +861,66 @@ AnonymousShmemDetach(int status, Datum arg)
 	}
 }
 
+/*
+ * An assign callback for the shared_buffers GUC -- a somewhat clumsy way of
+ * resizing shared memory without a restart. On NBuffers change, use the new
+ * value to recalculate the required size for every shmem slot, then, based
+ * on the new and old values, initialize the new buffer blocks.
+ *
+ * The actual slot resizing is done via mremap, which will fail if there is
+ * not sufficient space to expand the mapping.
+ *
+ * XXX: For some reason in the current implementation the change is applied
+ * to the backend calling pg_reload_conf only at backend exit.
+ */
+void
+AnonymousShmemResize(int newval, void *extra)
+{
+	int			numSemas;
+	bool		reinit = false;
+	int			NBuffersOld = NBuffers;
+
+	/*
+	 * XXX: Currently only increasing shared_buffers is supported. For
+	 * decreasing it, something similar has to be done, but the buffer
+	 * blocks holding data have to be drained first.
+	 */
+	if (NBuffers > newval)
+		return;
+
+	/* XXX: Hack, NBuffers has to be exposed in the interface for memory
+	 * calculation and buffer blocks reinitialization instead. */
+	NBuffers = newval;
+
+	for (int i = 0; i < next_free_slot; i++)
+	{
+		Size		new_size = CalculateShmemSize(&numSemas, i);
+		AnonymousMapping *m = &Mappings[i];
+
+		if (m->shmem == NULL)
+			continue;
+
+		if (m->shmem_size == new_size)
+			continue;
+
+		if (mremap(m->shmem, m->shmem_size, new_size, 0) < 0)
+			elog(LOG, "mremap(%p, %zu) failed: %m",
+				 m->shmem, m->shmem_size);
+		else
+		{
+			reinit = true;
+			m->shmem_size = new_size;
+		}
+	}
+
+	if (reinit)
+	{
+		LWLockAcquire(ShmemResizeLock, LW_EXCLUSIVE);
+		ResizeBufferPool(NBuffersOld);
+		LWLockRelease(ShmemResizeLock);
+	}
+}
+
 /*
  * PGSharedMemoryCreate
  *
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 6bca286bef..4054abf0e8 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -154,6 +154,92 @@ InitBufferPool(void)
 						 &backend_flush_after);
 }
 
+/*
+ * Reinitialize shared memory structures whose size depends on NBuffers. It's
+ * similar to InitBufferPool, but applies only to the buffers in the range
+ * between NBuffersOld and NBuffers.
+ */
+void
+ResizeBufferPool(int NBuffersOld)
+{
+	bool		foundBufs,
+				foundDescs,
+				foundIOCV,
+				foundBufCkpt;
+	int			i;
+
+	/* XXX: Only increasing shared_buffers is supported in this function */
+	if (NBuffersOld > NBuffers)
+		return;
+
+	/* Align descriptors to a cacheline boundary. */
+	BufferDescriptors = (BufferDescPadded *)
+		ShmemInitStructInSlot("Buffer Descriptors",
+							  NBuffers * sizeof(BufferDescPadded),
+							  &foundDescs, BUFFER_DESCRIPTORS_SHMEM_SLOT);
+
+	/* Align condition variables to cacheline boundary. */
+	BufferIOCVArray = (ConditionVariableMinimallyPadded *)
+		ShmemInitStructInSlot("Buffer IO Condition Variables",
+							  NBuffers * sizeof(ConditionVariableMinimallyPadded),
+							  &foundIOCV, BUFFER_IOCV_SHMEM_SLOT);
+
+	/*
+	 * The array used to sort to-be-checkpointed buffer ids is located in
+	 * shared memory, to avoid having to allocate significant amounts of
+	 * memory at runtime. As that'd be in the middle of a checkpoint, or when
+	 * the checkpointer is restarted, memory allocation failures would be
+	 * painful.
+	 */
+	CkptBufferIds = (CkptSortItem *)
+		ShmemInitStructInSlot("Checkpoint BufferIds",
+							  NBuffers * sizeof(CkptSortItem), &foundBufCkpt,
+							  CHECKPOINT_BUFFERS_SHMEM_SLOT);
+
+	/* Align buffer pool on IO page size boundary. */
+	BufferBlocks = (char *)
+		TYPEALIGN(PG_IO_ALIGN_SIZE,
+				  ShmemInitStructInSlot("Buffer Blocks",
+										NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+										&foundBufs, BUFFERS_SHMEM_SLOT));
+
+	/*
+	 * Initialize the headers for the new buffers.
+	 */
+	for (i = NBuffersOld - 1; i < NBuffers; i++)
+	{
+		BufferDesc *buf = GetBufferDescriptor(i);
+
+		ClearBufferTag(&buf->tag);
+
+		pg_atomic_init_u32(&buf->state, 0);
+		buf->wait_backend_pgprocno = INVALID_PROC_NUMBER;
+
+		buf->buf_id = i;
+
+		/*
+		 * Initially link all the buffers together as unused. Subsequent
+		 * management of this list is done by freelist.c.
+ */ + buf->freeNext = i + 1; + + LWLockInitialize(BufferDescriptorGetContentLock(buf), + LWTRANCHE_BUFFER_CONTENT); + + ConditionVariableInit(BufferDescriptorGetIOCV(buf)); + } + + /* Correct last entry of linked list */ + GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; + + /* Init other shared buffer-management stuff */ + StrategyInitialize(!foundDescs); + + /* Initialize per-backend file flush context */ + WritebackContextInit(&BackendWritebackContext, + &backend_flush_after); +} + /* * BufferShmemSize * diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index fbaddba396..56fa339f55 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -86,6 +86,9 @@ RequestAddinShmemSpace(Size size) * * If num_semaphores is not NULL, it will be set to the number of semaphores * required. + * + * XXX: Calculation for non main shared memory slots are incorrect, it includes + * more than needed for buffers only. */ Size CalculateShmemSize(int *num_semaphores, int shmem_slot) @@ -153,6 +156,14 @@ CalculateShmemSize(int *num_semaphores, int shmem_slot) size = add_size(size, SlotSyncShmemSize()); size = add_size(size, WaitLSNShmemSize()); + /* + * XXX: For some reason slightly more memory is needed for larger + * shared_buffers, but this size is enough for any large value I've tested + * with. Is it a mistake in how slots are split, or there was a hidden + * inconsistency in shmem calculation? + */ + size = add_size(size, 1024 * 1024 * 100); + /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index c670b9cf43..20c4b1d5ad 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -491,17 +491,13 @@ ShmemInitStructInSlot(const char *name, Size size, bool *foundPtr, { /* * Structure is in the shmem index so someone else has allocated it - * already. The size better be the same as the size we are trying to - * initialize to, or there is a name conflict (or worse). + * already. Verify the structure's size: + * - If it's the same, we've found the expected structure. + * - If it's different, we're resizing the expected structure. */ if (result->size != size) - { - LWLockRelease(ShmemIndexLock); - ereport(ERROR, - (errmsg("ShmemIndex entry size is wrong for data structure" - " \"%s\": expected %zu, actual %zu", - name, size, result->size))); - } + result->size = size; + structPtr = result->location; } else diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index d10ca723dc..42296d950e 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -347,6 +347,7 @@ DSMRegistry "Waiting to read or update the dynamic shared memory registry." InjectionPoint "Waiting to read or update information related to injection points." SerialControl "Waiting to read or update shared pg_serial state." WaitLSN "Waiting to read or update shared Wait-for-LSN state." +ShmemResize "Waiting to resize shared memory." 
 
 #
 # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE)
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 636780673b..7f2c45b7f9 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -2301,14 +2301,14 @@ struct config_int ConfigureNamesInt[] =
 	 * checking for overflow, so we mustn't allow more than INT_MAX / 2.
 	 */
 	{
-		{"shared_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+		{"shared_buffers", PGC_SIGHUP, RESOURCES_MEM,
 			gettext_noop("Sets the number of shared memory buffers used by the server."),
 			NULL,
 			GUC_UNIT_BLOCKS
 		},
 		&NBuffers,
 		16384, 16, INT_MAX / 2,
-		NULL, NULL, NULL
+		NULL, AnonymousShmemResize, NULL
 	},
 
 	{
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 4c09d270c9..ff75c46307 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -302,6 +302,7 @@ extern bool EvictUnpinnedBuffer(Buffer buf);
 /* in buf_init.c */
 extern void InitBufferPool(void);
 extern Size BufferShmemSize(int);
+extern void ResizeBufferPool(int);
 
 /* in localbuf.c */
 extern void AtProcExit_LocalBuffers(void);
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index 88dc79b2bd..fb310e8b9d 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -84,3 +84,4 @@ PG_LWLOCK(50, DSMRegistry)
 PG_LWLOCK(51, InjectionPoint)
 PG_LWLOCK(52, SerialControl)
 PG_LWLOCK(53, WaitLSN)
+PG_LWLOCK(54, ShmemResize)
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index c0143e3899..ff4736c6c8 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -105,6 +105,8 @@ extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
 extern void PGSharedMemoryDetach(void);
 extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);
 
+extern void AnonymousShmemResize(int newval, void *extra);
+
 /*
  * To be able to dynamically resize largest parts of the data stored in shared
  * memory, we split it into multiple shared memory mappings slots. Each slot
-- 
2.45.1
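
Editor's note, not part of the patch: for readers unfamiliar with mremap, below
is a minimal standalone sketch of the mechanism AnonymousShmemResize relies on,
namely growing an anonymous shared mapping in place. The sizes, file name and
messages here are made up for illustration; the patch itself resizes the
mappings tracked in Mappings[] and, like this sketch, passes flags = 0 to
mremap, so the kernel must extend the mapping at its current address and fails
(typically with ENOMEM) if the adjacent address space is unavailable -- the
case the patch logs and tolerates. Linux-specific, like the patch.

/* mremap_demo.c -- illustrative only */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int
main(void)
{
	size_t		old_size = 128 * 1024 * 1024;	/* think shared_buffers = 128MB */
	size_t		new_size = 512 * 1024 * 1024;	/* think shared_buffers = 512MB */
	void	   *addr;

	/* Anonymous shared mapping, a rough stand-in for one shmem slot. */
	addr = mmap(NULL, old_size, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED)
	{
		perror("mmap");
		return 1;
	}

	/* Grow in place; flags = 0 means the mapping may not be moved. */
	if (mremap(addr, old_size, new_size, 0) == MAP_FAILED)
	{
		perror("mremap");
		return 1;
	}

	printf("mapping at %p grown from %zu to %zu bytes\n",
		   addr, old_size, new_size);
	return 0;
}

To exercise the patch itself, one would raise shared_buffers in
postgresql.conf (for example from 128MB to 512MB) and call pg_reload_conf(),
as in the /proc/self/maps excerpts shown in the commit message above.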