From e9980f76cbd1ea6f6d732e2a27dd1342258d26e5 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Wed, 16 Oct 2024 20:21:33 +0200 Subject: [PATCH v1 2/5] Allow placing shared memory mapping with an offset Currently the kernel is responsible for choosing the address at which to place each shared memory mapping, which is the lowest possible address that does not clash with any other mappings. This is considered to be the most portable approach, but one of the downsides is that there is no room left to resize allocated mappings. Here is what it looks like for one mapping in /proc/$PID/maps, where /dev/zero represents the anonymous shared memory we are talking about: 00400000-00490000 /path/bin/postgres ... 012d9000-0133e000 [heap] 7f443a800000-7f470a800000 /dev/zero (deleted) 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2 ... 7f471aef2000-7f471aef9000 /dev/shm/PostgreSQL.3859891842 7f471aef9000-7f471aefa000 /SYSV007dbf7d (deleted) By specifying the mapping address directly it's possible to place the mapping in a way that leaves room for resizing. The idea is first to get the address chosen by the kernel, then apply some offset derived from the expected upper limit. Because we base the layout on the address chosen by the kernel, things like address space randomization should not be a problem, since the randomization is applied to the mmap base, which is one per process. The result looks like this: 012d9000-0133e000 [heap] 7f443a800000-7f444196c000 /dev/zero (deleted) [...free space...] 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2 This approach does not impact the actual memory usage as reported by the kernel. Here is the output of /proc/$PID/status for the master version with shared_buffers = 128 MB: // Peak virtual memory size, which is described as total pages mapped in mm_struct VmPeak: 422780 kB // Size of memory portions. 
It contains RssAnon + RssFile + RssShmem VmRSS: 21248 kB // Size of resident anonymous memory RssAnon: 640 kB // Size of resident file mappings RssFile: 9728 kB // Size of resident shmem memory (includes SysV shm, mapping of tmpfs and // shared anonymous mappings) RssShmem: 10880 kB Here is the same for the patch with the shared mapping placed at an offset of 10 GB: VmPeak: 1102844 kB VmRSS: 21376 kB RssAnon: 640 kB RssFile: 9856 kB RssShmem: 10880 kB Cgroup v2 doesn't have any problems with that either. To verify, a new cgroup was created with a memory limit of 256 MB, then PostgreSQL was launched within this cgroup with shared_buffers = 128 MB: $ cd /sys/fs/cgroup $ mkdir postgres $ cd postgres $ echo 268435456 > memory.max $ echo $MASTER_PID_SHELL > cgroup.procs # postgres from the master branch has been successfully launched # from that shell $ cat memory.current 17465344 (~16 MB) # stop postgres $ echo $PATCH_PID_SHELL > cgroup.procs # postgres from the patch has been successfully launched from that shell $ cat memory.current 18219008 (~17 MB) Note that currently the implementation makes assumptions about the upper limit. Ideally it should be based on the maximum available memory. --- src/backend/port/sysv_shmem.c | 120 +++++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 065a5b63ac..7e6c8bb78d 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -108,6 +108,63 @@ static AnonymousMapping Mappings[ANON_MAPPINGS]; /* Keeps track of used mapping slots */ static int next_free_slot = 0; +/* + * Anonymous mapping placement (/dev/zero (deleted) below) looks like this: + * + * 00400000-00490000 /path/bin/postgres + * ... + * 012d9000-0133e000 [heap] + * 7f443a800000-7f470a800000 /dev/zero (deleted) + * 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive + * 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2 + * ... 
+ * 7f471aef2000-7f471aef9000 /dev/shm/PostgreSQL.3859891842 + * 7f471aef9000-7f471aefa000 /SYSV007dbf7d (deleted) + * ... + * + * We would like to place multiple mappings in such a way that there will be + * enough space between them in the address space to be able to resize up to + * a certain size, but without counting towards the total memory consumption. + * + * If Linux is left to choose a mapping address, it will pick the lowest + * possible address that does not clash with any other mappings, which will be + * right before locales in the example above. This information (maximum allowed + * size of mappings and the lowest mapping address) is enough to place every + * mapping as follows: + * + * - Take the lowest mapping address, which we later call the probe address. + * - Subtract the offset of the previous mapping. + * - Subtract the maximum allowed size for the current mapping from the + * address. + * - Place the mapping at the resulting address. + * + * The result would look like this: + * + * 012d9000-0133e000 [heap] + * 7f4426f54000-7f442e010000 /dev/zero (deleted) + * [...free space...] + * 7f443a800000-7f444196c000 /dev/zero (deleted) + * [...free space...] + * 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive + * 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2 + * ... + */ +Size SHMEM_EXTRA_SIZE_LIMIT[1] = { + 0, /* MAIN_SHMEM_SLOT */ +}; + +/* Remembers offset of the last mapping from the probe address */ +static Size last_offset = 0; + +/* + * Size of the mapping, which will be used to calculate anonymous mapping + * address. It should not be too small, otherwise there is a chance the probe + * mapping will be created between other mappings, leaving no room for extending + * it. But it should not be too large either, in case there are limitations + * on the mapping size. The current value is the default shared_buffers. 
+ */ +#define PROBE_MAPPING_SIZE (Size) 128 * 1024 * 1024 + static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); static void IpcMemoryDetach(int status, Datum shmaddr); static void IpcMemoryDelete(int status, Datum shmId); @@ -673,13 +730,74 @@ CreateAnonymousSegment(AnonymousMapping *mapping) if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) { + void *probe = NULL; + /* * Use the original size, not the rounded-up value, when falling back * to non-huge pages. */ allocsize = mapping->shmem_size; - ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, + + /* + * Try to create the mapping at an address that will allow extending it + * later: + * + * - First create a temporary probe mapping of a fixed size and let + * the kernel place it at an address of its choice. By virtue of the + * probe mapping size we expect it to be located at the lowest + * possible address, expecting some unmapped space above. + * + * - Unmap the probe mapping, remember the address. + * + * - Create an actual anonymous mapping at that address with the + * offset. The offset is calculated in such a way as to allow growing + * the mapping within certain boundaries. For this mapping we use + * MAP_FIXED_NOREPLACE, which will error out with EEXIST if there is + * any mapping clash. + * + * - If the last step has failed, fall back to the regular mapping + * creation and signal that shared buffers could not be resized + * without a restart. 
+ */ + probe = mmap(NULL, PROBE_MAPPING_SIZE, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0); + + if (probe == MAP_FAILED) + { + mmap_errno = errno; + DebugMappings(); + elog(DEBUG1, "slot[%s]: probe mmap(%zu) failed: %m", + MappingName(mapping->shmem_slot), allocsize); + } + else + { + Size offset = last_offset + SHMEM_EXTRA_SIZE_LIMIT[next_free_slot] + allocsize; + last_offset = offset; + + munmap(probe, PROBE_MAPPING_SIZE); + + ptr = mmap(probe - offset, allocsize, PROT_READ | PROT_WRITE, + PG_MMAP_FLAGS | MAP_FIXED_NOREPLACE, -1, 0); + mmap_errno = errno; + if (ptr == MAP_FAILED) + { + DebugMappings(); + elog(DEBUG1, "slot[%s]: mmap(%zu) at address %p failed: %m", + MappingName(mapping->shmem_slot), allocsize, probe - offset); + } + + } + } + + if (ptr == MAP_FAILED) + { + /* + * Fallback to the portable way of creating a mapping. + */ + allocsize = mapping->shmem_size; + + ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, + PG_MMAP_FLAGS, -1, 0); mmap_errno = errno; } -- 2.45.1