From 6f0b93afb6a4ba1157482e674e71f56cd9c555c9 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 14 Feb 2025 18:31:15 -0500
Subject: [PATCH v3 2/2] Have escape functions process bytes after invalid
 multi-byte char

Reviewed-by: Jeff Davis <pgsql@j-davis.com>
Backpatch: 13
---
 src/fe_utils/string_utils.c    | 40 ++++++++++++++++++----------------
 src/interfaces/libpq/fe-exec.c | 17 ++++++++-------
 2 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index b6a7b197087..8621856fbc1 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -206,14 +206,13 @@ fmtIdEnc(const char *rawid, int encoding)
 				 * "skip" over quote characters, e.g. when parsing
 				 * character-by-character.
 				 *
-				 * Replace the bytes corresponding to the invalid character
-				 * with an invalid sequence, for the same reason as above.
+				 * Replace the current byte with an invalid sequence, for the
+				 * same reason as above.
 				 *
-				 * It would be a bit faster to verify the whole string the
-				 * first time we encounter a set highbit, but this way we can
-				 * replace just the invalid characters, which probably makes
-				 * it easier for users to find the invalidly encoded portion
-				 * of a larger string.
+				 * It would be a bit faster to verify the whole string the first
+				 * time we encounter a set highbit, but this way we can replace
+				 * just the invalid byte, which probably makes it easier for users
+				 * to find the invalidly encoded portion of a larger string.
 				 */
 				enlargePQExpBuffer(id_return, 2);
 				pg_encoding_set_invalid(encoding,
@@ -222,11 +221,13 @@ fmtIdEnc(const char *rawid, int encoding)
 				id_return->data[id_return->len] = '\0';
 
 				/*
-				 * Copy the rest of the string after the invalid multi-byte
-				 * character.
+				 * Handle the following bytes as if this byte didn't exist;
+				 * that's safer in case the subsequent bytes contain
+				 * characters that are significant for the caller (e.g. '>' in
+				 * HTML).
 				 */
-				remaining -= charlen;
-				cp += charlen;
+				remaining -= 1;
+				cp += 1;
 			}
 			else
 			{
@@ -421,23 +422,24 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
 			 * over quote characters, e.g. when parsing
 			 * character-by-character.
 			 *
-			 * Replace the bytes corresponding to the invalid character with
-			 * an invalid sequence, for the same reason as above.
+			 * Replace the current byte with an invalid sequence, for the
+			 * same reason as above.
 			 *
 			 * It would be a bit faster to verify the whole string the first
 			 * time we encounter a set highbit, but this way we can replace
-			 * just the invalid characters, which probably makes it easier for
-			 * users to find the invalidly encoded portion of a larger string.
+			 * just the invalid byte, which probably makes it easier for users
+			 * to find the invalidly encoded portion of a larger string.
 			 */
 			pg_encoding_set_invalid(encoding, target);
 			target += 2;
-			remaining -= charlen;
 
 			/*
-			 * Copy the rest of the string after the invalid multi-byte
-			 * character.
+			 * Handle the following bytes as if this byte didn't exist; that's
+			 * safer in case the subsequent bytes contain characters that are
+			 * significant for the caller (e.g. '>' in HTML).
 			 */
-			source += charlen;
+			remaining -= 1;
+			source += 1;
 		}
 		else
 		{
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index 120d4d032ec..53b906f9562 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -4139,13 +4139,13 @@ PQescapeStringInternal(PGconn *conn,
 			 * over quote characters, e.g. when parsing
 			 * character-by-character.
 			 *
-			 * Replace the bytes corresponding to the invalid character with
-			 * an invalid sequence, for the same reason as above.
+			 * Replace the current byte with an invalid sequence, for the
+			 * same reason as above.
 			 *
 			 * It would be a bit faster to verify the whole string the first
 			 * time we encounter a set highbit, but this way we can replace
-			 * just the invalid characters, which probably makes it easier for
-			 * users to find the invalidly encoded portion of a larger string.
+			 * just the invalid byte, which probably makes it easier for users
+			 * to find the invalidly encoded portion of a larger string.
 			 */
 			if (error)
 				*error = 1;
@@ -4154,13 +4154,14 @@ PQescapeStringInternal(PGconn *conn,
 
 			pg_encoding_set_invalid(encoding, target);
 			target += 2;
-			remaining -= charlen;
 
 			/*
-			 * Copy the rest of the string after the invalid multi-byte
-			 * character.
+			 * Handle the following bytes as if this byte didn't exist; that's
+			 * safer in case the subsequent bytes contain characters that are
+			 * significant for the caller (e.g. '>' in HTML).
 			 */
-			source += charlen;
+			remaining -= 1;
+			source += 1;
 		}
 		else
 		{
-- 
2.48.1.76.g4e746b1a31.dirty