From 0e3c671082743f2826a7e8a96a19a071f5c8aeb3 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sat, 15 Mar 2025 16:39:45 +0100
Subject: [PATCH v4 7/8] Use anonymous files to back shared memory segments

Allow using anonymous files for shared memory instead of plain
anonymous memory. Such an anonymous file is created via memfd_create; it
lives in memory, behaves like a regular file, and is semantically
equivalent to anonymous memory allocated via mmap with MAP_ANONYMOUS.

The advantages of using anonymous files are the following:

* We've got a file descriptor, which could be used for regular file
  operations (modification, truncation, you name it).

* The file can be given a name, which improves readability when it
  comes to process maps. Here is what it looks like:

7f90cde00000-7f90d5126000 rw-s 00000000 00:01 5463 /memfd:main (deleted)
7f90d5126000-7f914de00000 ---p 00000000 00:00 0
7f914de00000-7f9175128000 rw-s 00000000 00:01 5466 /memfd:buffers (deleted)
7f9175128000-7f944de00000 ---p 00000000 00:00 0
7f944de00000-7f9455528000 rw-s 00000000 00:01 5469 /memfd:descriptors (deleted)
7f9455528000-7f94cde00000 ---p 00000000 00:00 0
7f94cde00000-7f94d5228000 rw-s 00000000 00:01 5472 /memfd:iocv (deleted)
7f94d5228000-7f954de00000 ---p 00000000 00:00 0
7f954de00000-7f9555266000 rw-s 00000000 00:01 5475 /memfd:checkpoint (deleted)
7f9555266000-7f958de00000 ---p 00000000 00:00 0
7f958de00000-7f95954aa000 rw-s 00000000 00:01 5478 /memfd:strategy (deleted)
7f95954aa000-7f95cde00000 ---p 00000000 00:00 0

* By default, Linux does not include file-backed shared mappings in a core
  dump, making it more convenient to work with them in PostgreSQL: no more
  huge dumps to process.

The downside is that memfd_create is Linux-specific.
---
 src/backend/port/sysv_shmem.c  | 73 +++++++++++++++++++++++++++++-----
 src/backend/port/win32_shmem.c |  2 +-
 src/backend/storage/ipc/ipci.c |  2 +-
 src/include/portability/mem.h  |  2 +-
 src/include/storage/pg_shmem.h |  3 +-
 5 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index a3437973784..87000a24eea 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -107,6 +107,7 @@ typedef struct AnonymousMapping
 	Pointer shmem; 				/* Pointer to the start of the mapped memory */
 	Pointer seg_addr; 			/* SysV shared memory for the header */
 	unsigned long seg_id; 		/* IPC key */
+	int segment_fd; 			/* fd for the backing anon file */
 } AnonymousMapping;
 
 static AnonymousMapping Mappings[ANON_MAPPINGS];
@@ -127,7 +128,7 @@ static int next_free_segment = 0;
  * 00400000-00490000         /path/bin/postgres
  * ...
  * 012d9000-0133e000         [heap]
- * 7f443a800000-7f470a800000 /dev/zero (deleted)
+ * 7f443a800000-7f470a800000 /memfd:main (deleted)
  * 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive
  * 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2
  * ...
@@ -150,9 +151,9 @@ static int next_free_segment = 0;
  * The result would look like this:
  *
  * 012d9000-0133e000         [heap]
- * 7f4426f54000-7f442e010000 /dev/zero (deleted)
+ * 7f4426f54000-7f442e010000 /memfd:main (deleted)
  * 7f442e010000-7f443a800000                     # reserved empty space
- * 7f443a800000-7f444196c000 /dev/zero (deleted)
+ * 7f443a800000-7f444196c000 /memfd:buffers (deleted)
  * 7f444196c000-7f470a800000                     # reserved empty space
  * 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive
  * 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2
@@ -643,13 +644,14 @@ PGSharedMemoryAttach(IpcMemoryId shmId,
  * *hugepagesize and *mmap_flags are set to 0.
  */
 void
-GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags)
 {
 #ifdef MAP_HUGETLB
 
 	Size		default_hugepagesize = 0;
 	Size		hugepagesize_local = 0;
 	int			mmap_flags_local = 0;
+	int			memfd_flags_local = 0;
 
 	/*
 	 * System-dependent code to find out the default huge page size.
@@ -708,6 +710,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 	}
 
 	mmap_flags_local = MAP_HUGETLB;
+	memfd_flags_local = MFD_HUGETLB;
 
 	/*
 	 * On recent enough Linux, also include the explicit page size, if
@@ -718,7 +721,16 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 	{
 		int			shift = pg_ceil_log2_64(hugepagesize_local);
 
 		mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+	}
+#endif
+
+#if defined(MFD_HUGE_MASK) && defined(MFD_HUGE_SHIFT)
+	if (hugepagesize_local != default_hugepagesize)
+	{
+		int			shift = pg_ceil_log2_64(hugepagesize_local);
+
+		memfd_flags_local |= (shift & MFD_HUGE_MASK) << MFD_HUGE_SHIFT;
 	}
 #endif
 
@@ -727,6 +739,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 		*mmap_flags = mmap_flags_local;
 	if (hugepagesize)
 		*hugepagesize = hugepagesize_local;
+	if (memfd_flags)
+		*memfd_flags = memfd_flags_local;
 
 #else
 
@@ -734,6 +748,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 		*hugepagesize = 0;
 	if (mmap_flags)
 		*mmap_flags = 0;
+	if (memfd_flags)
+		*memfd_flags = 0;
 
 #endif							/* MAP_HUGETLB */
 }
@@ -771,7 +787,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
 	Size		allocsize = mapping->shmem_size;
 	void	   *ptr = MAP_FAILED;
 	int			mmap_errno = 0;
-	int			mmap_flags = PG_MMAP_FLAGS;
+	int			mmap_flags = PG_MMAP_FLAGS, memfd_flags = 0;
 
 #ifndef MAP_HUGETLB
 	/* ReserveAnonymousMemory should have dealt with this case */
@@ -785,7 +801,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
 		Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY);
 
 		/* Round up the request size to a suitable large value */
-		GetHugePageSize(&hugepagesize, &mmap_flags);
+		GetHugePageSize(&hugepagesize, &mmap_flags, &memfd_flags);
 
 		if (allocsize % hugepagesize != 0)
 			allocsize += hugepagesize - (allocsize % hugepagesize);
@@ -794,6 +810,34 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
 	}
 #endif
 
+	/*
+	 * Prepare an anonymous file backing the segment. Its size will be
+	 * specified later via ftruncate.
+	 *
+	 * The file behaves like a regular file, but lives in memory. Once all
+	 * references to the file are dropped, it is automatically released.
+	 * Anonymous memory is used for all backing pages of the file, thus it has
+	 * the same semantics as anonymous memory allocations using mmap with the
+	 * MAP_ANONYMOUS flag.
+	 */
+	mapping->segment_fd = memfd_create(MappingName(mapping->shmem_segment),
+									   memfd_flags);
+	if (mapping->segment_fd == -1)
+		ereport(FATAL,
+				(errcode(ERRCODE_SYSTEM_ERROR),
+				 errmsg("could not create anonymous file for \"%s\": %m",
+						MappingName(mapping->shmem_segment))));
+
+	/*
+	 * Specify the segment file size using allocsize, which may have been
+	 * rounded up to a multiple of the huge page size above.
+	 */
+	if (ftruncate(mapping->segment_fd, allocsize) == -1)
+		ereport(FATAL,
+				(errcode(ERRCODE_SYSTEM_ERROR),
+				 errmsg("could not truncate anonymous file for \"%s\": %m",
+						MappingName(mapping->shmem_segment))));
+
 	elog(DEBUG1, "segment[%s]: mmap(%zu) at address %p",
 		 MappingName(mapping->shmem_segment), allocsize, base + reserved_offset);
 
@@ -807,7 +846,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
 	 * a restart.
 	 */
 	ptr = mmap(base + reserved_offset, allocsize, PROT_READ | PROT_WRITE,
-			   mmap_flags | MAP_FIXED, -1, 0);
+			   mmap_flags | MAP_FIXED, mapping->segment_fd, 0);
 	mmap_errno = errno;
 
 	if (ptr == MAP_FAILED)
@@ -817,8 +856,15 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
 					 "fallback to the non-resizable allocation",
 			 MappingName(mapping->shmem_segment), allocsize, base + reserved_offset);
 
+		/* Specify the segment file size using allocsize. */
+		if (ftruncate(mapping->segment_fd, allocsize) == -1)
+			ereport(FATAL,
+					(errcode(ERRCODE_SYSTEM_ERROR),
+					 errmsg("could not truncate anonymous file for \"%s\": %m",
+							MappingName(mapping->shmem_segment))));
+
 		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
-						   PG_MMAP_FLAGS, -1, 0);
+						   PG_MMAP_FLAGS, mapping->segment_fd, 0);
 		mmap_errno = errno;
 	}
 	else
@@ -889,7 +935,7 @@ ReserveAnonymousMemory(Size reserve_size)
 		Size		hugepagesize, total_size = 0;
 		int			mmap_flags;
 
-		GetHugePageSize(&hugepagesize, &mmap_flags);
+		GetHugePageSize(&hugepagesize, &mmap_flags, NULL);
 
 		/*
 		 * Figure out how much memory is needed for all segments, keeping in
@@ -1070,6 +1116,13 @@ AnonymousShmemResize(void)
 		if (m->shmem_size == new_size)
 			continue;
 
+		/* Resize the backing anon file. */
+		if (ftruncate(m->segment_fd, new_size) == -1)
+			ereport(FATAL,
+					(errcode(ERRCODE_SYSTEM_ERROR),
+					 errmsg("could not truncate anonymous file for \"%s\": %m",
+							MappingName(m->shmem_segment))));
+
 		/* Clean up some reserved space to resize into */
 		if (munmap(m->shmem + m->shmem_size, new_size - m->shmem_size) == -1)
 			ereport(FATAL,
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index ce719f1b412..ba972106de1 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -627,7 +627,7 @@ pgwin32_ReserveSharedMemoryRegion(HANDLE hChild)
  * use GetLargePageMinimum() instead.
  */
 void
-GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags)
 {
 	if (hugepagesize)
 		*hugepagesize = 0;
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index abeb91e24fd..dc2b4becf4a 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -396,7 +396,7 @@ InitializeShmemGUCs(void)
 	/*
 	 * Calculate the number of huge pages required.
 	 */
-	GetHugePageSize(&hp_size, NULL);
+	GetHugePageSize(&hp_size, NULL, NULL);
 	if (hp_size != 0)
 	{
 		Size		hp_required;
diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h
index ef9800732d9..40588ff6968 100644
--- a/src/include/portability/mem.h
+++ b/src/include/portability/mem.h
@@ -38,7 +38,7 @@
 #define MAP_NOSYNC			0
 #endif
 
-#define PG_MMAP_FLAGS			(MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE)
+#define PG_MMAP_FLAGS			(MAP_SHARED|MAP_HASSEMAPHORE)
 
 /* Some really old systems don't define MAP_FAILED. */
 #ifndef MAP_FAILED
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 2e47b222cbb..b9573520d9a 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -124,7 +124,8 @@ extern PGShmemHeader *PGSharedMemoryCreate(Size size,
 										   PGShmemHeader **shim, Pointer base);
 extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
 extern void PGSharedMemoryDetach(void);
-extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);
+extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags,
+							int *memfd_flags);
 void *ReserveAnonymousMemory(Size reserve_size);
 
 bool ProcessBarrierShmemResize(Barrier *barrier);
-- 
2.45.1