From 2c1f47105cdc3a26bb4793cd52516ac30b8f4ec8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 22 Oct 2024 09:39:23 +0200 Subject: [PATCH v1] Support regular expressions with nondeterministic collations For normal regular expressions, the collation doesn't matter, so we can just allow it. But for SIMILAR TO, we cannot implement what the SQL standard requires, so we continue to not allow nondeterministic collations with SIMILAR TO. But since SIMILAR TO gets converted to a normal regular expression, we need to record somehow that the regular expression was originally a SIMILAR TO. To do that, we prepend a prefix "***S" which the RE parsing later recognizes and records. Based on that, we can issue the required error message. --- doc/src/sgml/charset.sgml | 2 +- doc/src/sgml/func.sgml | 21 +++++++++++++------ src/backend/regex/regc_lex.c | 4 ++++ src/backend/regex/regc_pg_locale.c | 14 ++++++++----- src/backend/regex/regcomp.c | 1 + src/backend/utils/adt/regexp.c | 11 +++++++++- src/include/regex/regex.h | 1 + src/include/regex/regguts.h | 1 + .../regress/expected/collate.icu.utf8.out | 16 ++++++++++---- 9 files changed, 54 insertions(+), 17 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 1ef5322b912..6c633678790 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -1197,7 +1197,7 @@ Nondeterministic Collations to a performance penalty. Note, in particular, that B-tree cannot use deduplication with indexes that use a nondeterministic collation. Also, certain operations are not possible with nondeterministic collations, - such as pattern matching operations. Therefore, they should be used + such as some pattern matching operations. Therefore, they should be used only in cases where they are specifically wanted. diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index ad663c94d77..803b6079f3a 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -5413,12 +5413,6 @@ Pattern Matching - - The pattern matching operators of all three kinds do not support - nondeterministic collations. If required, apply a different collation to - the expression to work around this limitation. - - <function>LIKE</function> @@ -5688,6 +5682,11 @@ <function>SIMILAR TO</function> Regular Expressions + + SIMILAR TO does not support nondeterministic + collations. + + The substring function with three parameters provides extraction of a substring that matches an SQL @@ -5900,6 +5899,16 @@ Regular Expression Match Operators + + Regular expression matching uses the applicable collation for character + classification, but not for string comparison. A single character in a + pattern always matches only exactly that character, independent of the + collation (unless a flag such as for case-insensitive matching is + specified; see below). It is different in this respect from + LIKE, which uses the collation to compare literal + substrings in the pattern. + + The POSIX pattern language is described in much greater detail below. diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c index 9087ef95af3..d373bbd1aaf 100644 --- a/src/backend/regex/regc_lex.c +++ b/src/backend/regex/regc_lex.c @@ -122,6 +122,10 @@ prefixes(struct vars *v) v->cflags |= REG_ADVANCED; v->now += 4; break; + case CHR('S'): /* "***S" was a converted SIMILAR TO pattern */ + NOTE(REG_UNONPOSIX | PG_REG_SIMILAR_TO); + v->now += 4; + break; default: /* otherwise *** is just an error */ ERR(REG_BADRPT); return; diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index b75784b6ce5..933fb4f68e1 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -260,11 +260,6 @@ pg_set_regex_collation(Oid collation) { locale = pg_newlocale_from_collation(collation); - if (!locale->deterministic) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("nondeterministic collations are not supported for regular expressions"))); - if (locale->ctype_is_c) { /* @@ -301,6 +296,15 @@ pg_set_regex_collation(Oid collation) pg_regex_collation = collation; } +void +pg_check_regex_collation(bool is_similar_to) +{ + if (is_similar_to && !pg_regex_locale->deterministic) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("nondeterministic collations are not supported for SIMILAR TO"))); +} + static int pg_wc_isdigit(pg_wchar c) { diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 8a6cfb2973d..03b7814605b 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -450,6 +450,7 @@ pg_regcomp(regex_t *re, /* parsing */ lexstart(v); /* also handles prefixes */ + pg_check_regex_collation(v->re->re_info & PG_REG_SIMILAR_TO); if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH)) { /* assign newline a unique color */ diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 0e2519bfd57..9ab6a078ebf 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -841,9 +841,18 @@ similar_escape_internal(text *pat_text, text *esc_text) * as 3 output bytes per input byte; since the input is at most 1GB this * can't overflow size_t. */ - result = (text *) palloc(VARHDRSZ + 23 + 3 * (size_t) plen); + result = (text *) palloc(VARHDRSZ + 27 + 3 * (size_t) plen); r = VARDATA(result); + /* + * Prefix with "***S" so that the core regular expression code can tell + * that this was a SIMILAR TO pattern. + */ + *r++ = '*'; + *r++ = '*'; + *r++ = '*'; + *r++ = 'S'; + *r++ = '^'; *r++ = '('; *r++ = '?'; diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index f34ee3a85bd..341547a7fef 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -151,6 +151,7 @@ typedef struct #define REG_UEMPTYMATCH 004000 /* can match a zero-length string */ #define REG_UIMPOSSIBLE 010000 /* provably cannot match anything */ #define REG_USHORTEST 020000 /* has non-greedy quantifier */ +#define PG_REG_SIMILAR_TO 100000 /* PostgreSQL: RE was converted from SIMILAR TO */ int re_csize; /* sizeof(character) */ char *re_endp; /* backward compatibility kludge */ Oid re_collation; /* Collation that defines LC_CTYPE behavior */ diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 3ca3647e118..ca903407da2 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -545,4 +545,5 @@ struct guts /* prototypes for functions that are exported from regcomp.c to regexec.c */ extern void pg_set_regex_collation(Oid collation); +extern void pg_check_regex_collation(bool is_similar_to); extern color pg_reg_getcolor(struct colormap *cm, chr c); diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index faa376e060c..601577cdec3 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1518,9 +1518,13 @@ ERROR: nondeterministic collations are not supported for LIKE SELECT x FROM test3ci WHERE x ILIKE 'a%'; ERROR: nondeterministic collations are not supported for ILIKE SELECT x FROM test3ci WHERE x SIMILAR TO 'a%'; -ERROR: nondeterministic collations are not supported for regular expressions +ERROR: nondeterministic collations are not supported for SIMILAR TO SELECT x FROM test3ci WHERE x ~ 'a'; -ERROR: nondeterministic collations are not supported for regular expressions + x +----- + abc +(1 row) + SELECT x FROM test1ci UNION SELECT x FROM test2ci ORDER BY x; x ----- @@ -1636,9 +1640,13 @@ ERROR: nondeterministic collations are not supported for LIKE SELECT x FROM test3bpci WHERE x ILIKE 'a%'; ERROR: nondeterministic collations are not supported for ILIKE SELECT x FROM test3bpci WHERE x SIMILAR TO 'a%'; -ERROR: nondeterministic collations are not supported for regular expressions +ERROR: nondeterministic collations are not supported for SIMILAR TO SELECT x FROM test3bpci WHERE x ~ 'a'; -ERROR: nondeterministic collations are not supported for regular expressions + x +----- + abc +(1 row) + SELECT x FROM test1bpci UNION SELECT x FROM test2bpci ORDER BY x; x ----- base-commit: d2b4b4c2259e21ceaf05e393769b69728bfbee99 -- 2.47.0