From 6df85a35e8f6cca94a963d516f1b6974850ba05b Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Tue, 15 Oct 2024 16:18:45 +0200 Subject: [PATCH v1 5/5] Use anonymous files to back shared memory segments Allow using anonymous files for shared memory instead of plain anonymous memory. Such an anonymous file is created via memfd_create; it lives in memory, behaves like a regular file, and is semantically equivalent to anonymous memory allocated via mmap with MAP_ANONYMOUS. The advantages of using anonymous files are the following: * We get a file descriptor, which can be used for regular file operations (modification, truncation, you name it). * The file can be given a name, which improves readability when it comes to process maps. Here is what it looks like: 7f5a2bd04000-7f5a32e52000 rw-s 00000000 00:01 1845 /memfd:strategy (deleted) 7f5a39252000-7f5a4030e000 rw-s 00000000 00:01 1842 /memfd:checkpoint (deleted) 7f5a4670e000-7f5a4d7ba000 rw-s 00000000 00:01 1839 /memfd:iocv (deleted) 7f5a53bba000-7f5a5ad26000 rw-s 00000000 00:01 1836 /memfd:descriptors (deleted) 7f5a9ad26000-7f5aa9d94000 rw-s 00000000 00:01 1833 /memfd:buffers (deleted) 7f5d29d94000-7f5d30e00000 rw-s 00000000 00:01 1830 /memfd:main (deleted) * By default, Linux does not include file-backed shared mappings in a core dump, making it more convenient to work with them in PostgreSQL: no more huge dumps to process. The downside is that memfd_create is Linux-specific.
--- src/backend/port/sysv_shmem.c | 47 +++++++++++++++++++++++++++++------ src/include/portability/mem.h | 2 +- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 4bdadbb0e2..a01c3e4789 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -103,6 +103,7 @@ typedef struct AnonymousMapping void *shmem; /* Pointer to the start of the mapped memory */ void *seg_addr; /* SysV shared memory for the header */ unsigned long seg_id; /* IPC key */ + int segment_fd; /* fd for the backing anon file */ } AnonymousMapping; static AnonymousMapping Mappings[ANON_MAPPINGS]; @@ -116,7 +117,7 @@ static int next_free_slot = 0; * 00400000-00490000 /path/bin/postgres * ... * 012d9000-0133e000 [heap] - * 7f443a800000-7f470a800000 /dev/zero (deleted) + * 7f443a800000-7f470a800000 /memfd:main (deleted) * 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive * 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2 * ... @@ -143,9 +144,9 @@ static int next_free_slot = 0; * The result would look like this: * * 012d9000-0133e000 [heap] - * 7f4426f54000-7f442e010000 /dev/zero (deleted) + * 7f4426f54000-7f442e010000 /memfd:main (deleted) * [...free space...] - * 7f443a800000-7f444196c000 /dev/zero (deleted) + * 7f443a800000-7f444196c000 /memfd:buffers (deleted) * [...free space...] * 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive * 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2 @@ -708,6 +709,18 @@ CreateAnonymousSegment(AnonymousMapping *mapping) void *ptr = MAP_FAILED; int mmap_errno = 0; + /* + * Prepare an anonymous file backing the segment. Its size will be + * specified later via ftruncate. + * + * The file behaves like a regular file, but lives in memory. Once all + * references to the file are dropped, it is automatically released. 
+ * Anonymous memory is used for all backing pages of the file, thus it has + * the same semantics as anonymous memory allocations using mmap with the + * MAP_ANONYMOUS flag. + */ + mapping->segment_fd = memfd_create(MappingName(mapping->shmem_slot), 0); + #ifndef MAP_HUGETLB /* PGSharedMemoryCreate should have dealt with this case */ Assert(huge_pages != HUGE_PAGES_ON); @@ -725,8 +738,13 @@ CreateAnonymousSegment(AnonymousMapping *mapping) if (allocsize % hugepagesize != 0) allocsize += hugepagesize - (allocsize % hugepagesize); + /* + * Do not use an anonymous file here yet. When adding it, do not forget + * to use ftruncate and flags MFD_HUGETLB & MFD_HUGE_2MB/MFD_HUGE_1GB + * in memfd_create. + */ ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS | mmap_flags, -1, 0); + PG_MMAP_FLAGS | MAP_ANONYMOUS | mmap_flags, -1, 0); mmap_errno = errno; if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) { @@ -762,7 +780,8 @@ CreateAnonymousSegment(AnonymousMapping *mapping) * - First create the temporary probe mapping of a fixed size and let * kernel to place it at address of its choice. By the virtue of the * probe mapping size we expect it to be located at the lowest - * possible address, expecting some non mapped space above. + * possible address, expecting some non mapped space above. The probe + * does not need to be backed by an anonymous file. * * - Unmap the probe mapping, remember the address. * @@ -777,7 +796,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping) * without a restart. */ probe = mmap(NULL, PROBE_MAPPING_SIZE, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS, -1, 0); + PG_MMAP_FLAGS | MAP_ANONYMOUS, -1, 0); if (probe == MAP_FAILED) { @@ -793,8 +812,14 @@ CreateAnonymousSegment(AnonymousMapping *mapping) munmap(probe, PROBE_MAPPING_SIZE); + /* + * Specify the segment file size using allocsize, which contains + * the potentially modified size.
+ */ + ftruncate(mapping->segment_fd, allocsize); + ptr = mmap(probe - offset, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS | MAP_FIXED_NOREPLACE, -1, 0); + PG_MMAP_FLAGS | MAP_FIXED_NOREPLACE, mapping->segment_fd, 0); mmap_errno = errno; if (ptr == MAP_FAILED) { @@ -813,8 +838,11 @@ CreateAnonymousSegment(AnonymousMapping *mapping) */ allocsize = mapping->shmem_size; + /* Specify the segment file size using allocsize. */ + ftruncate(mapping->segment_fd, allocsize); + ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS, -1, 0); + PG_MMAP_FLAGS, mapping->segment_fd, 0); mmap_errno = errno; } @@ -903,6 +931,9 @@ AnonymousShmemResize(int newval, void *extra) if (m->shmem_size == new_size) continue; + /* Resize the backing anon file. */ + ftruncate(m->segment_fd, new_size); + if (mremap(m->shmem, m->shmem_size, new_size, 0) < 0) elog(LOG, "mremap(%p, %zu) failed: %m", m->shmem, m->shmem_size); diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h index 2cd05313b8..50db0da28d 100644 --- a/src/include/portability/mem.h +++ b/src/include/portability/mem.h @@ -38,7 +38,7 @@ #define MAP_NOSYNC 0 #endif -#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE) +#define PG_MMAP_FLAGS (MAP_SHARED|MAP_HASSEMAPHORE) /* Some really old systems don't define MAP_FAILED. */ #ifndef MAP_FAILED -- 2.45.1