From f88c48d1403ac72929af60f55bbd761f8e1914b8 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 20 Nov 2024 16:30:11 -0600
Subject: [PATCH v2] attempt multibyte-aware truncation of database names

---
 src/backend/utils/init/postinit.c | 76 +++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 3 deletions(-)
 100.0% src/backend/utils/init/

diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 5b657a3f13..a594c0d4f3 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -1003,14 +1003,84 @@ InitPostgres(const char *in_dbname, Oid dboid,
 	{
 		HeapTuple	tuple;
 		Form_pg_database dbform;
+		char		trunc_dbname[NAMEDATALEN];
+
+		/* truncate to NAMEDATALEN-1 bytes first */
+		strncpy(trunc_dbname, in_dbname, NAMEDATALEN - 1);
+		trunc_dbname[NAMEDATALEN - 1] = '\0';
+
+		tuple = GetDatabaseTuple(trunc_dbname);
+		if (HeapTupleIsValid(tuple))
+			strcpy(dbname, trunc_dbname);
+
+		/*
+		 * If the original name is too long and both bytes around the
+		 * truncation point have their high bits set, we might have truncated
+		 * in the middle of a multibyte character. In multibyte encodings,
+		 * every byte of a multibyte character has its high bit set, so if
+		 * IS_HIGHBIT_SET is true for both in_dbname[NAMEDATALEN-1] (the first
+		 * dropped byte) and in_dbname[NAMEDATALEN-2] (the last kept byte), the
+		 * cut may fall inside a character. Also try the shorter truncations
+		 * that could correspond to the start of that character.
+		 */
+		if (strlen(in_dbname) >= NAMEDATALEN &&
+			IS_HIGHBIT_SET(in_dbname[NAMEDATALEN - 1]) &&
+			IS_HIGHBIT_SET(in_dbname[NAMEDATALEN - 2]))
+		{
+			/*
+			 * Try progressively shorter truncations to find a character
+			 * boundary. We check at most MAX_MULTIBYTE_CHAR_LEN-1 bytes back.
+			 */
+			for (int i = 1; i < MAX_MULTIBYTE_CHAR_LEN; i++)
+			{
+				HeapTuple	tmp;
+
+				/* truncate one more byte back */
+				trunc_dbname[NAMEDATALEN - 1 - i] = '\0';
+				tmp = GetDatabaseTuple(trunc_dbname);
+
+				/*
+				 * If we already had a match then the name is ambiguous and we
+				 * must fail.
+				 */
+				if (HeapTupleIsValid(tmp))
+				{
+					if (HeapTupleIsValid(tuple))
+						ereport(FATAL,
+								(errmsg("ambiguous database name")));
+					tuple = tmp;
+					strcpy(dbname, trunc_dbname);
+				}
+
+				/*
+				 * If the last byte we kept has its high bit clear (an ASCII
+				 * byte), the truncation point cannot be inside a multibyte
+				 * character, so there is no earlier boundary worth trying.
+				 * Look at in_dbname rather than dbname: dbname is only
+				 * filled in once a lookup has succeeded, so it does not
+				 * reliably hold the name being truncated.
+				 */
+				if (!IS_HIGHBIT_SET(in_dbname[NAMEDATALEN - 2 - i]))
+					break;
+			}
+		}
 
-		tuple = GetDatabaseTuple(in_dbname);
 		if (!HeapTupleIsValid(tuple))
 			ereport(FATAL,
 					(errcode(ERRCODE_UNDEFINED_DATABASE),
 					 errmsg("database \"%s\" does not exist", in_dbname)));
+
 		dbform = (Form_pg_database) GETSTRUCT(tuple);
 		dboid = dbform->oid;
+
+		if (MyProcPort && MyProcPort->database_name &&
+			strcmp(MyProcPort->database_name, dbname) != 0)
+		{
+			pfree(MyProcPort->database_name);
+			MyProcPort->database_name = pstrdup(dbname);
+
+			/* XXX: should we fix process title? */
+		}
 	}
 	else if (!OidIsValid(dboid))
 	{
@@ -1067,12 +1137,12 @@ InitPostgres(const char *in_dbname, Oid dboid,
 			datform = (Form_pg_database) GETSTRUCT(tuple);
 
 		if (!HeapTupleIsValid(tuple) ||
-			(in_dbname && namestrcmp(&datform->datname, in_dbname)))
+			(in_dbname && namestrcmp(&datform->datname, dbname)))
 		{
 			if (in_dbname)
 				ereport(FATAL,
 						(errcode(ERRCODE_UNDEFINED_DATABASE),
-						 errmsg("database \"%s\" does not exist", in_dbname),
+						 errmsg("database \"%s\" does not exist", dbname),
 						 errdetail("It seems to have just been dropped or renamed.")));
 			else
 				ereport(FATAL,
-- 
2.34.1