From eae77d430e6e6cc3ec95b2cf613e4b3ae095e75e Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Wed, 16 Oct 2024 20:21:33 +0200
Subject: [PATCH v4 2/8] Address space reservation for shared memory

Currently the kernel is responsible for choosing the address at which to
place each shared memory mapping: the lowest possible address that does
not clash with any other mapping. This is considered the most portable
approach, but one of its downsides is that it leaves no room to resize
the allocated mappings afterwards. Here is how it looks for one mapping
in /proc/$PID/maps, where /dev/zero represents the anonymous shared
memory in question:

00400000-00490000            /path/bin/postgres
...
012d9000-0133e000            [heap]
7f443a800000-7f470a800000    /dev/zero (deleted)
7f470a800000-7f471831d000    /usr/lib/locale/locale-archive
7f4718400000-7f4718401000    /usr/lib64/libicudata.so.74.2
...
7f471aef2000-7f471aef9000    /dev/shm/PostgreSQL.3859891842
7f471aef9000-7f471aefa000    /SYSV007dbf7d (deleted)

By specifying the mapping address directly it is possible to place the
mapping in a way that leaves room for resizing. The idea is (see the
sketch after the memory accounting numbers below):

* Reserve some address space by mmap'ing a large chunk of memory with
  PROT_NONE and MAP_NORESERVE. This prepares a playground for laying
  out the shared memory without risking that anything else interferes
  with it.

* Slice the reserved space up into sections, one for each shared
  segment.

* Allocate the shared memory segments out of the corresponding slices,
  leaving unclaimed space in between them. This is implemented by
  mmap'ing memory at a specified address inside the reserved space with
  MAP_FIXED.

The result looks like this:

012d9000-0133e000            [heap]
7f443a800000-7f444196c000    /dev/zero (deleted)
7f444196c000-7f470a800000    # reserved space
7f470a800000-7f471831d000    /usr/lib/locale/locale-archive
7f4718400000-7f4718401000    /usr/lib64/libicudata.so.74.2

Things like address space randomization should not be a problem in this
context, since the randomization is applied to the mmap base, which is
chosen once per process. This approach also does not impact the actual
memory usage as reported by the kernel. Here is the output of
/proc/$PID/status for the master version with shared_buffers = 128 MB:

// Peak virtual memory size, which is described as total pages
// mapped in mm_struct. It corresponds to the mapped reserved space
// and is the only number that grows with it.
VmPeak:  2043192 kB

// Size of resident memory portions. It contains
// RssAnon + RssFile + RssShmem.
VmRSS:     22908 kB

// Size of resident anonymous memory
RssAnon:     768 kB

// Size of resident file mappings
RssFile:   10364 kB

// Size of resident shmem memory (includes SysV shm, mappings of tmpfs
// and shared anonymous mappings)
RssShmem:  11776 kB

Here is the same for the patch when reserving 20 GB of space:

VmPeak:  21250648 kB
VmRSS:      22948 kB
RssAnon:      768 kB
RssFile:    10404 kB
RssShmem:   11776 kB
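For illustration, here is a minimal standalone sketch of the
reserve-then-carve technique described above (not part of the patch; the
1 GB reservation, the 128 MB segment and the error handling are arbitrary
simplifications). Running it and inspecting /proc/$PID/status shows
VmPeak growing with the reservation, while VmRSS only reflects the pages
actually touched:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int
main(void)
{
	size_t		reserve_size = (size_t) 1024 * 1024 * 1024; /* 1 GB */
	size_t		segment_size = (size_t) 128 * 1024 * 1024;	/* 128 MB */
	char	   *base;
	char	   *segment;

	/* Reserve address space only; no memory is committed yet. */
	base = mmap(NULL, reserve_size, PROT_NONE,
				MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
	if (base == MAP_FAILED)
	{
		perror("reserve mmap");
		return 1;
	}

	/* Carve a usable shared segment out of the reservation. */
	segment = mmap(base, segment_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	if (segment == MAP_FAILED)
	{
		perror("segment mmap");
		return 1;
	}

	/* Touch the segment; only these pages become resident. */
	memset(segment, 0, segment_size);
	printf("reserved at %p, segment at %p\n", (void *) base, segment);
	return 0;
}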
Cgroup v2 does not have any problems with this either. To verify, a new
cgroup was created with a memory limit of 256 MB, and PostgreSQL was
launched within this cgroup with shared_buffers = 128 MB:

$ cd /sys/fs/cgroup
$ mkdir postgres
$ cd postgres
$ echo 268435456 > memory.max
$ echo $MASTER_PID_SHELL > cgroup.procs
# postgres from the master branch has been successfully launched
# from that shell
$ cat memory.current
17465344 (~16.6 MB)
# stop postgres
$ echo $PATCH_PID_SHELL > cgroup.procs
# postgres from the patch has been successfully launched from that shell
$ cat memory.current
17637376 (~16.8 MB)

To control the amount of space reserved, a new GUC max_available_memory
is introduced. Ideally it should be set based on the maximum available
memory, hence the name.
---
 src/backend/port/sysv_shmem.c       | 284 ++++++++++++++++++++++++----
 src/backend/port/win32_shmem.c      |   2 +-
 src/backend/storage/ipc/ipci.c      |   5 +-
 src/backend/utils/init/globals.c    |   1 +
 src/backend/utils/misc/guc_tables.c |  14 ++
 src/include/storage/pg_shmem.h      |   4 +-
 6 files changed, 271 insertions(+), 39 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 56af0231d24..a0f03ff868f 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -108,6 +108,66 @@ static AnonymousMapping Mappings[ANON_MAPPINGS];
 /* Keeps track of used mapping segments */
 static int	next_free_segment = 0;
 
+/*
+ * Anonymous mapping placement (/dev/zero (deleted) below) looks like this:
+ *
+ * 00400000-00490000            /path/bin/postgres
+ * ...
+ * 012d9000-0133e000            [heap]
+ * 7f443a800000-7f470a800000    /dev/zero (deleted)
+ * 7f470a800000-7f471831d000    /usr/lib/locale/locale-archive
+ * 7f4718400000-7f4718401000    /usr/lib64/libicudata.so.74.2
+ * ...
+ * 7f471aef2000-7f471aef9000    /dev/shm/PostgreSQL.3859891842
+ * 7f471aef9000-7f471aefa000    /SYSV007dbf7d (deleted)
+ * ...
+ *
+ * We would like to place multiple mappings in such a way that there is
+ * enough space between them in the address space to resize each of them up
+ * to a certain size, without counting towards the total memory consumption.
+ *
+ * To achieve that we first reserve some shared memory address space by
+ * mmap'ing a segment of MaxAvailableMemory size with PROT_NONE and
+ * MAP_NORESERVE (these flags make sure the space will not be used by
+ * anything else, yet do not count against memory limits). Having the
+ * reserved space, we allocate actual chunks of shared memory out of it as
+ * usual, updating a pointer to the currently available reserved space for
+ * the next allocation, with the gap between segments in mind.
+ *
+ * The result looks like this:
+ *
+ * 012d9000-0133e000            [heap]
+ * 7f4426f54000-7f442e010000    /dev/zero (deleted)
+ * 7f442e010000-7f443a800000    # reserved empty space
+ * 7f443a800000-7f444196c000    /dev/zero (deleted)
+ * 7f444196c000-7f470a800000    # reserved empty space
+ * 7f470a800000-7f471831d000    /usr/lib/locale/locale-archive
+ * 7f4718400000-7f4718401000    /usr/lib64/libicudata.so.74.2
+ * [...]
+ *
+ * The reserved space pointer is calculated so as to slice the total
+ * reserved space into fixed fractions of address space for each segment,
+ * as specified in the SHMEM_RESIZE_RATIO array.
+ */
+static double SHMEM_RESIZE_RATIO[1] = {
+	1.0,						/* MAIN_SHMEM_SLOT */
+};
+
+/*
+ * Offset from the beginning of the reserved space, indicating the currently
+ * available range. New shared memory segments have to be allocated at this
+ * offset relative to the start of the reserved space.
+ */
+static Size reserved_offset = 0;
+
+/*
+ * Flag indicating that we have decided to use huge pages.
+ *
+ * XXX: It's possible to use GetConfigOption("huge_pages_status", false, false)
+ * instead, but it feels like overkill.
+ */
+static bool huge_pages_on = false;
+
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
 static void IpcMemoryDelete(int status, Datum shmId);
@@ -626,39 +686,198 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
  *
  * This function will modify mapping size to the actual size of the allocation,
  * if it ends up allocating a segment that is larger than requested.
+ *
+ * Note that we do not switch from huge pages to regular pages in this
+ * function; that decision was already made in ReserveAnonymousMemory and we
+ * stick to it.
  */
 static void
-CreateAnonymousSegment(AnonymousMapping *mapping)
+CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
 {
 	Size		allocsize = mapping->shmem_size;
 	void	   *ptr = MAP_FAILED;
 	int			mmap_errno = 0;
+	int			mmap_flags = PG_MMAP_FLAGS;
 
 #ifndef MAP_HUGETLB
-	/* PGSharedMemoryCreate should have dealt with this case */
-	Assert(huge_pages != HUGE_PAGES_ON);
+	/* ReserveAnonymousMemory should have dealt with this case */
+	Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on);
 #else
-	if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
+	if (huge_pages_on)
 	{
-		/*
-		 * Round up the request size to a suitable large value.
-		 */
 		Size		hugepagesize;
-		int			mmap_flags;
 
+		/* Make sure nothing is messed up */
+		Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY);
+
+		/* Round up the request size to a suitable large value */
 		GetHugePageSize(&hugepagesize, &mmap_flags);
 		if (allocsize % hugepagesize != 0)
 			allocsize += hugepagesize - (allocsize % hugepagesize);
+
+		mmap_flags = PG_MMAP_FLAGS | mmap_flags;
+	}
+#endif
+
+	elog(DEBUG1, "segment[%s]: mmap(%zu) at address %p",
+		 MappingName(mapping->shmem_segment), allocsize, base + reserved_offset);
+
+	/*
+	 * Try to create the mapping at an address inside the reserved range,
+	 * which will allow extending it later. Use reserved_offset to allocate
+	 * the segment, then update the currently available reserved range.
+	 *
+	 * If this step fails, fall back to the regular mapping creation
+	 * and signal that shared buffers cannot be resized without
+	 * a restart.
+	 */
+	ptr = mmap(base + reserved_offset, allocsize, PROT_READ | PROT_WRITE,
+			   mmap_flags | MAP_FIXED, -1, 0);
+	mmap_errno = errno;
+
+	if (ptr == MAP_FAILED)
+	{
+		DebugMappings();
+		elog(DEBUG1, "segment[%s]: mmap(%zu) at address %p failed: %m, "
+			 "falling back to a non-resizable allocation",
+			 MappingName(mapping->shmem_segment), allocsize, base + reserved_offset);
+
 		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
-				   PG_MMAP_FLAGS | mmap_flags, -1, 0);
+				   PG_MMAP_FLAGS, -1, 0);
 		mmap_errno = errno;
+	}
+	else
+	{
+		Size		total_reserved = (Size) MaxAvailableMemory * BLCKSZ;
+
+		reserved_offset += total_reserved * SHMEM_RESIZE_RATIO[next_free_segment];
+	}
+
+	if (ptr == MAP_FAILED)
+	{
+		errno = mmap_errno;
+		DebugMappings();
+		ereport(FATAL,
+				(errmsg("segment[%s]: could not map anonymous shared memory: %m",
+						MappingName(mapping->shmem_segment)),
+				 (mmap_errno == ENOMEM) ?
+				 errhint("This error usually means that PostgreSQL's request "
+						 "for a shared memory segment exceeded available memory, "
+						 "swap space, or huge pages. To reduce the request size "
+						 "(currently %zu bytes), reduce PostgreSQL's shared "
+						 "memory usage, perhaps by reducing \"shared_buffers\" or "
+						 "\"max_connections\".",
+						 allocsize) : 0));
+	}
+
+	mapping->shmem = ptr;
+	mapping->shmem_size = allocsize;
+}
+
+/*
+ * ReserveAnonymousMemory
+ *
+ * Reserve shared memory address space, from which shared memory segments are
+ * going to be sliced out. The goal of this exercise is to support segment
+ * resizing, for which we need a reserved space free of potential clashes with
+ * other mmap'd areas that are not under our control. The reservation is done
+ * via mmap and does not allocate any memory until it is actually used;
+ * MAP_NORESERVE keeps it from counting against kernel reservation limits
+ * (e.g. in cgroups or for huge pages). Do not be confused by the name
+ * MAP_NORESERVE -- we need to reserve some address space, but not the actual
+ * memory, and that is exactly what this flag is about.
+ *
+ * Note that with MAP_NORESERVE a reservation with hugetlb will succeed even
+ * if there are actually not enough huge pages. Hence this function is
+ * responsible for deciding whether to use huge pages or not. To achieve that
+ * we probe first and try to allocate the memory needed for all segments --
+ * if this succeeds, we unmap the probe segment and use hugetlb; if it fails,
+ * we proceed with regular memory.
+ */
+void *
+ReserveAnonymousMemory(Size reserve_size)
+{
+	Size		allocsize = reserve_size;
+	void	   *ptr = MAP_FAILED;
+	int			mmap_errno = 0;
+
+	/* Complain if hugepages demanded but we can't possibly support them */
+#if !defined(MAP_HUGETLB)
+	if (huge_pages == HUGE_PAGES_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge pages not supported on this platform")));
+#else
+	if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
+	{
+		Size		hugepagesize, total_size = 0;
+		int			mmap_flags;
+
+		GetHugePageSize(&hugepagesize, &mmap_flags);
+
+		/*
+		 * Figure out how much memory is needed for all segments, keeping in
+		 * mind that for every segment this value will be rounded up to the
+		 * huge page size. The resulting value will be used to probe memory
+		 * and decide whether we will allocate huge pages or not.
+		 *
+		 * We could actually have a mix and match of segments with and without
+		 * huge pages. But in that case we would need multiple reservation
+		 * spaces with the corresponding memory (hugetlb address space reserved
+		 * for hugetlb segments, regular memory for others), and it doesn't
+		 * seem worth the complexity for now.
+		 */
+		for (int segment = 0; segment < ANON_MAPPINGS; segment++)
+		{
+			int			numSemas;
+			Size		segment_size = CalculateShmemSize(&numSemas, segment);
+
+			if (segment_size % hugepagesize != 0)
+				segment_size += hugepagesize - (segment_size % hugepagesize);
+
+			total_size += segment_size;
+		}
+
+		/* Map the total amount of memory to test its availability. */
+		elog(DEBUG1, "reserving space: probe mmap(%zu) with MAP_HUGETLB",
+			 total_size);
+		ptr = mmap(NULL, total_size, PROT_NONE,
+				   PG_MMAP_FLAGS | MAP_ANONYMOUS | mmap_flags, -1, 0);
 		mmap_errno = errno;
 
 		if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
 		{
-			DebugMappings();
-			elog(DEBUG1, "segment[%s]: mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
-				 MappingName(mapping->shmem_segment), allocsize);
+			/* No huge pages, we will go with the regular page size */
+			elog(DEBUG1, "reserving space: probe mmap(%zu) with MAP_HUGETLB "
+				 "failed, huge pages disabled: %m", total_size);
+		}
+		else
+		{
+			/*
+			 * All fine, unmap the probe segment and proceed with the
+			 * reservation using huge pages.
+			 */
+			if (munmap(ptr, total_size) < 0)
+				elog(LOG, "reserving space: munmap(%p, %zu) failed: %m",
+					 ptr, total_size);
+
+			/* Round up the requested size to a suitable large value. */
+			if (allocsize % hugepagesize != 0)
+				allocsize += hugepagesize - (allocsize % hugepagesize);
+
+			elog(DEBUG1, "reserving space: mmap(%zu) with MAP_HUGETLB",
+				 allocsize);
+			ptr = mmap(NULL, allocsize, PROT_NONE,
+					   PG_MMAP_FLAGS | MAP_ANONYMOUS | MAP_NORESERVE | mmap_flags,
+					   -1, 0);
+			mmap_errno = errno;
+
+			/* This should not happen, but handle errors anyway */
+			if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
+			{
+				elog(DEBUG1, "reserving space: mmap(%zu) with MAP_HUGETLB "
+					 "failed, huge pages disabled: %m", allocsize);
+			}
+		}
 	}
 #endif
@@ -666,10 +885,12 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
 
 	/*
 	 * Report whether huge pages are in use. This needs to be tracked before
 	 * the second mmap() call if attempting to use huge pages failed
-	 * previously.
+	 * previously. At this point ptr is either MAP_FAILED, if we could not
+	 * mmap with huge pages, or points to the reserved space.
 	 */
 	SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
 					PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
+	huge_pages_on = ptr != MAP_FAILED;
 
 	if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
 	{
@@ -677,10 +898,11 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
 		 * Use the original size, not the rounded-up value, when falling back
 		 * to non-huge pages.
 		 */
-		allocsize = mapping->shmem_size;
-		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
-				   PG_MMAP_FLAGS, -1, 0);
-		mmap_errno = errno;
+		allocsize = reserve_size;
+		elog(DEBUG1, "reserving space: mmap(%zu)", allocsize);
+		ptr = mmap(NULL, allocsize, PROT_NONE,
+				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
+		mmap_errno = errno;
 	}
 
 	if (ptr == MAP_FAILED)
 	{
@@ -688,20 +910,18 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
 		errno = mmap_errno;
 		DebugMappings();
 		ereport(FATAL,
-				(errmsg("segment[%s]: could not map anonymous shared memory: %m",
-						MappingName(mapping->shmem_segment)),
+				(errmsg("reserving space: could not map anonymous shared "
+						"memory: %m"),
 				 (mmap_errno == ENOMEM) ?
 				 errhint("This error usually means that PostgreSQL's request "
-						 "for a shared memory segment exceeded available memory, "
-						 "swap space, or huge pages. To reduce the request size "
-						 "(currently %zu bytes), reduce PostgreSQL's shared "
-						 "memory usage, perhaps by reducing \"shared_buffers\" or "
-						 "\"max_connections\".",
+						 "for a reserved shared memory address space exceeded "
+						 "available memory, swap space, or huge pages. To "
To " + "reduce the request reservation size (currently %zu " + "bytes), reduce PostgreSQL's \"maximum_shared_buffers\".", allocsize) : 0)); } - mapping->shmem = ptr; - mapping->shmem_size = allocsize; + return ptr; } /* @@ -740,7 +960,7 @@ AnonymousShmemDetach(int status, Datum arg) */ PGShmemHeader * PGSharedMemoryCreate(Size size, - PGShmemHeader **shim) + PGShmemHeader **shim, Pointer base) { IpcMemoryKey NextShmemSegID; void *memAddress; @@ -760,14 +980,6 @@ PGSharedMemoryCreate(Size size, errmsg("could not stat data directory \"%s\": %m", DataDir))); - /* Complain if hugepages demanded but we can't possibly support them */ -#if !defined(MAP_HUGETLB) - if (huge_pages == HUGE_PAGES_ON) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("huge pages not supported on this platform"))); -#endif - /* For now, we don't support huge pages in SysV memory */ if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP) ereport(ERROR, @@ -782,7 +994,7 @@ PGSharedMemoryCreate(Size size, if (shared_memory_type == SHMEM_TYPE_MMAP) { /* On success, mapping data will be modified. */ - CreateAnonymousSegment(mapping); + CreateAnonymousSegment(mapping, base); next_free_segment++; diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c index 4dee856d6bd..ce719f1b412 100644 --- a/src/backend/port/win32_shmem.c +++ b/src/backend/port/win32_shmem.c @@ -205,7 +205,7 @@ EnableLockPagesPrivilege(int elevel) */ PGShmemHeader * PGSharedMemoryCreate(Size size, - PGShmemHeader **shim) + PGShmemHeader **shim, Pointer base) { void *memAddress; PGShmemHeader *hdr; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 8b38e985327..076888c0172 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -203,9 +203,12 @@ CreateSharedMemoryAndSemaphores(void) PGShmemHeader *seghdr; Size size; int numSemas; + void *base; Assert(!IsUnderPostmaster); + base = ReserveAnonymousMemory((Size) MaxAvailableMemory * BLCKSZ); + for(int segment = 0; segment < ANON_MAPPINGS; segment++) { /* Compute the size of the shared-memory block */ @@ -217,7 +220,7 @@ CreateSharedMemoryAndSemaphores(void) * * XXX: Do multiple shims are needed, one per segment? */ - seghdr = PGSharedMemoryCreate(size, &shim); + seghdr = PGSharedMemoryCreate(size, &shim, base); /* * Make sure that huge pages are never reported as "unknown" while the diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 2152aad97d9..1d42a5856c0 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -140,6 +140,7 @@ int max_parallel_maintenance_workers = 2; * register background workers. */ int NBuffers = 16384; +int MaxAvailableMemory = 131072; int MaxConnections = 100; int max_worker_processes = 8; int max_parallel_workers = 8; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 4eaeca89f2c..dede37f7905 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2364,6 +2364,20 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_available_memory", PGC_SIGHUP, RESOURCES_MEM, + gettext_noop("Sets the upper limit for the shared_buffers value."), + gettext_noop("Shared memory could be resized at runtime, this " + "parameters sets the upper limit for it, beyond which " + "resizing would not be supported. 
+						 "would be the same as the total available memory."),
+			GUC_UNIT_BLOCKS
+		},
+		&MaxAvailableMemory,
+		131072, 16, INT_MAX / 2,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"vacuum_buffer_usage_limit", PGC_USERSET, RESOURCES_MEM,
 			gettext_noop("Sets the buffer pool size for VACUUM, ANALYZE, and autovacuum."),
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 138078c29c5..4a83e255652 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -60,6 +60,7 @@ extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS];
 extern PGDLLIMPORT int shared_memory_type;
 extern PGDLLIMPORT int huge_pages;
 extern PGDLLIMPORT int huge_page_size;
+extern PGDLLIMPORT int MaxAvailableMemory;
 
 /* Possible values for huge_pages and huge_pages_status */
 typedef enum
@@ -100,10 +101,11 @@ extern void PGSharedMemoryNoReAttach(void);
 #endif
 
 extern PGShmemHeader *PGSharedMemoryCreate(Size size,
-										   PGShmemHeader **shim);
+										   PGShmemHeader **shim, Pointer base);
 extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
 extern void PGSharedMemoryDetach(void);
 extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);
+extern void *ReserveAnonymousMemory(Size reserve_size);
 
 /* The main segment, contains everything except buffer blocks and related data. */
 #define MAIN_SHMEM_SEGMENT 0
-- 
2.45.1