From 18e3cf3eb1b3af0fb2f2f805e17eb7d0540edc91 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Tue, 19 Sep 2023 16:18:21 +0900
Subject: [PATCH v2] unaccent: Add support for quoted translated characters

---
 doc/src/sgml/unaccent.sgml                  | 16 ++++
 contrib/unaccent/expected/unaccent.out      | 36 +++++++++
 contrib/unaccent/generate_unaccent_rules.py |  4 +
 contrib/unaccent/sql/unaccent.sql           |  6 ++
 contrib/unaccent/unaccent.c                 | 84 ++++++++++++++++++---
 contrib/unaccent/unaccent.rules             | 56 +++++++-------
 6 files changed, 164 insertions(+), 38 deletions(-)

diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml
index f3ddc64bbc..94100ed260 100644
--- a/doc/src/sgml/unaccent.sgml
+++ b/doc/src/sgml/unaccent.sgml
@@ -84,6 +84,22 @@
     </para>
    </listitem>
 
+   <listitem>
+    <para>
+     Some characters, like numeric symbols, may require whitespace in their
+     translation rule. It is possible to use double quotes around the translated
+     characters in this case. To include a double quote in the translated
+     characters, escape it by writing it twice. For example:
+<programlisting>
+&frac14;      " 1/4"
+&frac12;      " 1/2"
+&frac34;      " 3/4"
+&ldquo;       """"
+&rdquo;       """"
+</programlisting>
+    </para>
+   </listitem>
+
    <listitem>
     <para>
      As with other <productname>PostgreSQL</productname> text search configuration files,
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
index f080707c4a..d03374c799 100644
--- a/contrib/unaccent/expected/unaccent.out
+++ b/contrib/unaccent/expected/unaccent.out
@@ -51,6 +51,18 @@ SELECT unaccent('℗'); -- sound recording copyright
  (P)
 (1 row)
 
+SELECT unaccent('1½'); -- math expression with whitespace
+ unaccent 
+----------
+ 1 1/2
+(1 row)
+
+SELECT unaccent('〝'); -- quote
+ unaccent 
+----------
+ "
+(1 row)
+
 SELECT unaccent('unaccent', 'foobar');
  unaccent 
 ----------
@@ -93,6 +105,18 @@ SELECT unaccent('unaccent', '℗');
  (P)
 (1 row)
 
+SELECT unaccent('unaccent', '1½');
+ unaccent 
+----------
+ 1 1/2
+(1 row)
+
+SELECT unaccent('unaccent', '〝');
+ unaccent 
+----------
+ "
+(1 row)
+
 SELECT ts_lexize('unaccent', 'foobar');
  ts_lexize 
 -----------
@@ -135,6 +159,18 @@ SELECT ts_lexize('unaccent', '℗');
  {(P)}
 (1 row)
 
+SELECT ts_lexize('unaccent', '1½');
+ ts_lexize 
+-----------
+ {"1 1/2"}
+(1 row)
+
+SELECT ts_lexize('unaccent', '〝');
+ ts_lexize 
+-----------
+ {"\""}
+(1 row)
+
 -- Controversial case.  Black-Letter Capital H (U+210C) is translated by
 -- Latin-ASCII.xml as 'x', but it should be 'H'.
 SELECT unaccent('ℌ');
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index b4b4c38beb..cffb7db7ce 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -58,6 +58,10 @@ COMBINING_MARK_RANGES = ((0x0300, 0x0362),   # Mn: Accents, IPA
 
 def print_record(codepoint, letter):
     if letter:
+        # If the letter contains a space or a double quote, double any
+        # embedded double quotes and wrap the result in double quotes.
+        if (' ' in letter) or ('"' in letter):
+            letter = '"' + letter.replace('"', '""') + '"'
         output = chr(codepoint) + "\t" + letter
     else:
         output = chr(codepoint)
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
index 663646c1ac..70c7f1c0a0 100644
--- a/contrib/unaccent/sql/unaccent.sql
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -20,6 +20,8 @@ SELECT unaccent('˃˖˗˜');
 SELECT unaccent('À');  -- Remove combining diacritical 0x0300
 SELECT unaccent('℃℉'); -- degree signs
 SELECT unaccent('℗'); -- sound recording copyright
+SELECT unaccent('1½'); -- math expression with whitespace
+SELECT unaccent('〝'); -- quote
 
 SELECT unaccent('unaccent', 'foobar');
 SELECT unaccent('unaccent', 'ёлка');
@@ -28,6 +30,8 @@ SELECT unaccent('unaccent', '˃˖˗˜');
 SELECT unaccent('unaccent', 'À');
 SELECT unaccent('unaccent', '℃℉');
 SELECT unaccent('unaccent', '℗');
+SELECT unaccent('unaccent', '1½');
+SELECT unaccent('unaccent', '〝');
 
 SELECT ts_lexize('unaccent', 'foobar');
 SELECT ts_lexize('unaccent', 'ёлка');
@@ -36,6 +40,8 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
 SELECT ts_lexize('unaccent', 'À');
 SELECT ts_lexize('unaccent', '℃℉');
 SELECT ts_lexize('unaccent', '℗');
+SELECT ts_lexize('unaccent', '1½');
+SELECT ts_lexize('unaccent', '〝');
 
 -- Controversial case.  Black-Letter Capital H (U+210C) is translated by
 -- Latin-ASCII.xml as 'x', but it should be 'H'.
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
index 64c879e547..dd705c9cd6 100644
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -127,24 +127,30 @@ initTrie(const char *filename)
 				 * src and trg are sequences of one or more non-whitespace
 				 * characters, separated by whitespace.  Whitespace at start
 				 * or end of line is ignored.  If trg is omitted, an empty
-				 * string is used as the replacement.
+				 * string is used as the replacement.  trg can optionally be
+				 * double-quoted, in which case it may contain whitespace.
 				 *
 				 * We use a simple state machine, with states
 				 *	0	initial (before src)
 				 *	1	in src
 				 *	2	in whitespace after src
-				 *	3	in trg
-				 *	4	in whitespace after trg
-				 *	-1	syntax error detected
+				 *	3	in trg (non-quoted)
+				 *	4	in trg (quoted)
+				 *	5	in whitespace after trg
+				 *	-1	syntax error detected (more than two strings)
+				 *	-2	syntax error detected (unfinished quoted string)
 				 *----------
 				 */
 				int			state;
 				char	   *ptr;
 				char	   *src = NULL;
 				char	   *trg = NULL;
+				char	   *trgstore = NULL;
 				int			ptrlen;
 				int			srclen = 0;
 				int			trglen = 0;
+				int			trgstorelen = 0;
+				bool		trgquoted = false;
 
 				state = 0;
 				for (ptr = line; *ptr; ptr += ptrlen)
@@ -156,8 +162,10 @@ initTrie(const char *filename)
 						if (state == 1)
 							state = 2;
 						else if (state == 3)
-							state = 4;
-						continue;
+							state = 5;
+						/* whitespaces are OK in quoted area */
+						if (state != 4)
+							continue;
 					}
 					switch (state)
 					{
@@ -173,14 +181,41 @@ initTrie(const char *filename)
 							break;
 						case 2:
 							/* start of trg */
+							if (*ptr == '"')
+							{
+								trgquoted = true;
+								state = 4;
+							}
+							else
+								state = 3;
+
 							trg = ptr;
 							trglen = ptrlen;
-							state = 3;
 							break;
 						case 3:
-							/* continue trg */
+							/* continue non-quoted trg */
 							trglen += ptrlen;
 							break;
+						case 4:
+							/* continue quoted trg */
+							trglen += ptrlen;
+
+							/*
+							 * If this is a quote, consider it as the end of
+							 * trg except if the follow-up character is itself
+							 * a quote.
+							 */
+							if (*ptr == '"')
+							{
+								if (*(ptr + 1) == '"')
+								{
+									ptr++;
+									trglen += 1;
+								}
+								else
+									state = 5;
+							}
+							break;
 						default:
 							/* bogus line format */
 							state = -1;
@@ -195,14 +230,43 @@ initTrie(const char *filename)
 					trglen = 0;
 				}
 
+				/* If still in a quoted area, fall back to an error */
+				if (state == 4)
+					state = -2;
+
+				/* If trg was quoted, remove its quotes and unescape it */
+				if (trgquoted && state > 0)
+				{
+					/* Ignore first and end quotes: at most trglen - 2 bytes */
+					trgstore = palloc0(sizeof(char) * (trglen - 2));
+					trgstorelen = 0;
+					for (int i = 1; i < trglen - 1; i++)
+					{
+						trgstore[trgstorelen] = trg[i];
+						trgstorelen++;
+						/* an escaped "" pair yields one quote: skip the second */
+						if (trg[i] == '"' && trg[i + 1] == '"')
+							i++;
+					}
+				}
+				else
+				{
+					trgstore = trg;
+					trgstorelen = trglen;
+				}
+
 				if (state > 0)
 					rootTrie = placeChar(rootTrie,
 										 (unsigned char *) src, srclen,
-										 trg, trglen);
-				else if (state < 0)
+										 trgstore, trgstorelen);
+				else if (state == -1)
 					ereport(WARNING,
 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
 							 errmsg("invalid syntax: more than two strings in unaccent rule")));
+				else if (state == -2)
+					ereport(WARNING,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
 
 				pfree(line);
 			}
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index 3030166ed6..ca6caa51f5 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -5,9 +5,9 @@
 ®	(R)
 ±	+/-
 »	>>
-¼	 1/4
-½	 1/2
-¾	 3/4
+¼	" 1/4"
+½	" 1/2"
+¾	" 3/4"
 ¿	?
 À	A
 Á	A
@@ -403,7 +403,7 @@
 ʪ	ls
 ʫ	lz
 ʹ	'
-ʺ	"
+ʺ	""""
 ʻ	'
 ʼ	'
 ʽ	'
@@ -1058,15 +1058,15 @@
 ’	'
 ‚	,
 ‛	'
-“	"
-”	"
+“	""""
+”	""""
 „	,,
-‟	"
+‟	""""
 ․	.
 ‥	..
 …	...
 ′	'
-″	"
+″	""""
 ‹	<
 ›	>
 ‼	!!
@@ -1134,22 +1134,22 @@
 ⅇ	e
 ⅈ	i
 ⅉ	j
-⅐	 1/7
-⅑	 1/9
-⅒	 1/10
-⅓	 1/3
-⅔	 2/3
-⅕	 1/5
-⅖	 2/5
-⅗	 3/5
-⅘	 4/5
-⅙	 1/6
-⅚	 5/6
-⅛	 1/8
-⅜	 3/8
-⅝	 5/8
-⅞	 7/8
-⅟	 1/
+⅐	" 1/7"
+⅑	" 1/9"
+⅒	" 1/10"
+⅓	" 1/3"
+⅔	" 2/3"
+⅕	" 1/5"
+⅖	" 2/5"
+⅗	" 3/5"
+⅘	" 4/5"
+⅙	" 1/6"
+⅚	" 5/6"
+⅛	" 1/8"
+⅜	" 3/8"
+⅝	" 5/8"
+⅞	" 7/8"
+⅟	" 1/"
 Ⅰ	I
 Ⅱ	II
 Ⅲ	III
@@ -1182,7 +1182,7 @@
 ⅽ	c
 ⅾ	d
 ⅿ	m
-↉	 0/3
+↉	" 0/3"
 −	-
 ∕	/
 ∖	\
@@ -1296,8 +1296,8 @@
 〙	]
 〚	[
 〛	]
-〝	"
-〞	"
+〝	""""
+〞	""""
 ㍱	hPa
 ㍲	da
 ㍳	AU
@@ -1512,7 +1512,7 @@
 ﹪	%
 ﹫	@
 ！	!
-＂	"
+＂	""""
 ＃	#
 ＄	$
 ％	%
-- 
2.40.1

