From cc8b59a382bb9283bb61e826917268a4c98b8a57 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 9 Dec 2024 10:48:34 +0100 Subject: [PATCH v2 4/4] Remove ts_locale.c's lowerstr() lowerstr() and lowerstr_with_len() in ts_locale.c do the same thing as str_tolower(), which the rest of the system uses, except that the former don't use the common locale provider framework but instead use the global libc locale settings. This patch replaces uses of lowerstr*() with str_tolower(..., DEFAULT_COLLATION_OID). For instances that use a libc locale globally, this will result in exactly the same behavior. For instances that use other locale providers, you now get consistent behavior and are no longer dependent on the libc locale settings (for this case; other dependencies on them remain elsewhere). Most uses of these functions are for processing dictionary and configuration files. In those cases, using the default collation seems appropriate. At least we don't have a more specific collation available. But the code in contrib/pg_trgm should really depend on the collation of the columns being processed. This is not done here; it can be done in a separate patch. 
--- contrib/dict_xsyn/dict_xsyn.c | 6 +- contrib/pg_trgm/trgm_op.c | 6 +- contrib/pg_trgm/trgm_regexp.c | 8 ++- src/backend/snowball/dict_snowball.c | 8 ++- src/backend/tsearch/dict_ispell.c | 7 ++- src/backend/tsearch/dict_simple.c | 7 ++- src/backend/tsearch/dict_synonym.c | 8 ++- src/backend/tsearch/spell.c | 7 ++- src/backend/tsearch/ts_locale.c | 89 ---------------------------- src/backend/tsearch/ts_utils.c | 5 +- src/include/tsearch/ts_locale.h | 3 - src/include/tsearch/ts_public.h | 2 +- 12 files changed, 39 insertions(+), 117 deletions(-) diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c index f8c0a5bf5c5..2206300f7b5 100644 --- a/contrib/dict_xsyn/dict_xsyn.c +++ b/contrib/dict_xsyn/dict_xsyn.c @@ -14,9 +14,11 @@ #include +#include "catalog/pg_collation_d.h" #include "commands/defrem.h" #include "tsearch/ts_locale.h" #include "tsearch/ts_public.h" +#include "utils/formatting.h" PG_MODULE_MAGIC; @@ -93,7 +95,7 @@ read_dictionary(DictSyn *d, const char *filename) if (*line == '\0') continue; - value = lowerstr(line); + value = str_tolower(line, strlen(line), DEFAULT_COLLATION_OID); pfree(line); pos = value; @@ -210,7 +212,7 @@ dxsyn_lexize(PG_FUNCTION_ARGS) { char *temp = pnstrdup(in, length); - word.key = lowerstr(temp); + word.key = str_tolower(temp, length, DEFAULT_COLLATION_OID); pfree(temp); word.value = NULL; } diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index c509d15ee40..d0833b3e4a1 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -5,12 +5,14 @@ #include +#include "catalog/pg_collation_d.h" #include "catalog/pg_type.h" #include "common/int.h" #include "lib/qunique.h" #include "miscadmin.h" #include "trgm.h" #include "tsearch/ts_locale.h" +#include "utils/formatting.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -303,7 +305,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds) while ((bword = find_word(eword, slen - 
(eword - str), &eword, &charlen)) != NULL) { #ifdef IGNORECASE - bword = lowerstr_with_len(bword, eword - bword); + bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID); bytelen = strlen(bword); #else bytelen = eword - bword; @@ -899,7 +901,7 @@ generate_wildcard_trgm(const char *str, int slen) buf, &bytelen, &charlen)) != NULL) { #ifdef IGNORECASE - buf2 = lowerstr_with_len(buf, bytelen); + buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID); bytelen = strlen(buf2); #else buf2 = buf; diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 75d6d1d4a8d..457b21b8302 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -191,9 +191,11 @@ */ #include "postgres.h" +#include "catalog/pg_collation_d.h" #include "regex/regexport.h" #include "trgm.h" #include "tsearch/ts_locale.h" +#include "utils/formatting.h" #include "utils/hsearch.h" #include "utils/memutils.h" #include "varatt.h" @@ -847,16 +849,16 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) * within each color, since we used the REG_ICASE option; so there's no * need to process the uppercase version. * - * XXX this code is dependent on the assumption that lowerstr() works the + * XXX this code is dependent on the assumption that str_tolower() works the * same as the regex engine's internal case folding machinery. Might be * wiser to expose pg_wc_tolower and test whether c == pg_wc_tolower(c). * On the other hand, the trigrams in the index were created using - * lowerstr(), so we're probably screwed if there's any incompatibility + * str_tolower(), so we're probably screwed if there's any incompatibility * anyway. 
*/ #ifdef IGNORECASE { - char *lowerCased = lowerstr(s); + char *lowerCased = str_tolower(s, strlen(s), DEFAULT_COLLATION_OID); if (strcmp(lowerCased, s) != 0) { diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c index caf86490683..12f7485bcde 100644 --- a/src/backend/snowball/dict_snowball.c +++ b/src/backend/snowball/dict_snowball.c @@ -12,9 +12,11 @@ */ #include "postgres.h" +#include "catalog/pg_collation_d.h" #include "commands/defrem.h" -#include "tsearch/ts_locale.h" +#include "mb/pg_wchar.h" #include "tsearch/ts_public.h" +#include "utils/formatting.h" /* Some platforms define MAXINT and/or MININT, causing conflicts */ #ifdef MAXINT @@ -236,7 +238,7 @@ dsnowball_init(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); - readstoplist(defGetString(defel), &d->stoplist, lowerstr); + readstoplist(defGetString(defel), &d->stoplist, str_tolower); stoploaded = true; } else if (strcmp(defel->defname, "language") == 0) @@ -272,7 +274,7 @@ dsnowball_lexize(PG_FUNCTION_ARGS) DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); - char *txt = lowerstr_with_len(in, len); + char *txt = str_tolower(in, len, DEFAULT_COLLATION_OID); TSLexeme *res = palloc0(sizeof(TSLexeme) * 2); /* diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c index 07b9ad794de..8772c95038f 100644 --- a/src/backend/tsearch/dict_ispell.c +++ b/src/backend/tsearch/dict_ispell.c @@ -13,11 +13,12 @@ */ #include "postgres.h" +#include "catalog/pg_collation_d.h" #include "commands/defrem.h" #include "tsearch/dicts/spell.h" -#include "tsearch/ts_locale.h" #include "tsearch/ts_public.h" #include "utils/fmgrprotos.h" +#include "utils/formatting.h" typedef struct @@ -72,7 +73,7 @@ dispell_init(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords 
parameters"))); - readstoplist(defGetString(defel), &(d->stoplist), lowerstr); + readstoplist(defGetString(defel), &(d->stoplist), str_tolower); stoploaded = true; } else @@ -121,7 +122,7 @@ dispell_lexize(PG_FUNCTION_ARGS) if (len <= 0) PG_RETURN_POINTER(NULL); - txt = lowerstr_with_len(in, len); + txt = str_tolower(in, len, DEFAULT_COLLATION_OID); res = NINormalizeWord(&(d->obj), txt); if (res == NULL) diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c index b0c9fd7946f..b914875dd96 100644 --- a/src/backend/tsearch/dict_simple.c +++ b/src/backend/tsearch/dict_simple.c @@ -13,10 +13,11 @@ */ #include "postgres.h" +#include "catalog/pg_collation_d.h" #include "commands/defrem.h" -#include "tsearch/ts_locale.h" #include "tsearch/ts_public.h" #include "utils/fmgrprotos.h" +#include "utils/formatting.h" typedef struct @@ -47,7 +48,7 @@ dsimple_init(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); - readstoplist(defGetString(defel), &d->stoplist, lowerstr); + readstoplist(defGetString(defel), &d->stoplist, str_tolower); stoploaded = true; } else if (strcmp(defel->defname, "accept") == 0) @@ -80,7 +81,7 @@ dsimple_lexize(PG_FUNCTION_ARGS) char *txt; TSLexeme *res; - txt = lowerstr_with_len(in, len); + txt = str_tolower(in, len, DEFAULT_COLLATION_OID); if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) { diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index 77c0d7a3593..70adbba546c 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -13,10 +13,12 @@ */ #include "postgres.h" +#include "catalog/pg_collation_d.h" #include "commands/defrem.h" #include "tsearch/ts_locale.h" #include "tsearch/ts_public.h" #include "utils/fmgrprotos.h" +#include "utils/formatting.h" typedef struct { @@ -183,8 +185,8 @@ dsynonym_init(PG_FUNCTION_ARGS) } else { - d->syn[cur].in = lowerstr(starti); - d->syn[cur].out 
= lowerstr(starto); + d->syn[cur].in = str_tolower(starti, strlen(starti), DEFAULT_COLLATION_OID); + d->syn[cur].out = str_tolower(starto, strlen(starto), DEFAULT_COLLATION_OID); } d->syn[cur].outlen = strlen(starto); @@ -223,7 +225,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS) if (d->case_sensitive) key.in = pnstrdup(in, len); else - key.in = lowerstr_with_len(in, len); + key.in = str_tolower(in, len, DEFAULT_COLLATION_OID); key.out = NULL; diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c index 7eca1714e9b..fcbda395946 100644 --- a/src/backend/tsearch/spell.c +++ b/src/backend/tsearch/spell.c @@ -66,6 +66,7 @@ #include "miscadmin.h" #include "tsearch/dicts/spell.h" #include "tsearch/ts_locale.h" +#include "utils/formatting.h" #include "utils/memutils.h" @@ -169,7 +170,7 @@ cpstrdup(IspellDict *Conf, const char *str) /* - * Apply lowerstr(), producing a temporary result (in the buildCxt). + * Apply str_tolower(), producing a temporary result (in the buildCxt). */ static char * lowerstr_ctx(IspellDict *Conf, const char *src) @@ -178,7 +179,7 @@ lowerstr_ctx(IspellDict *Conf, const char *src) char *dst; saveCtx = MemoryContextSwitchTo(Conf->buildCxt); - dst = lowerstr(src); + dst = str_tolower(src, strlen(src), DEFAULT_COLLATION_OID); MemoryContextSwitchTo(saveCtx); return dst; @@ -1449,7 +1450,7 @@ NIImportAffixes(IspellDict *Conf, const char *filename) while ((recoded = tsearch_readline(&trst)) != NULL) { - pstr = lowerstr(recoded); + pstr = str_tolower(recoded, strlen(recoded), DEFAULT_COLLATION_OID); /* Skip comments and empty lines */ if (*pstr == '#' || *pstr == '\n') diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index a61fd36022e..b2aefa31c26 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -197,92 +197,3 @@ tsearch_readline_callback(void *arg) stp->lineno, stp->filename); } - - -/* - * lowerstr --- fold null-terminated string to lower case - * - * Returned string is palloc'd 
- */ -char * -lowerstr(const char *str) -{ - return lowerstr_with_len(str, strlen(str)); -} - -/* - * lowerstr_with_len --- fold string to lower case - * - * Input string need not be null-terminated. - * - * Returned string is palloc'd - */ -char * -lowerstr_with_len(const char *str, int len) -{ - char *out; - pg_locale_t mylocale = 0; /* TODO */ - - if (len == 0) - return pstrdup(""); - - /* - * Use wide char code only when max encoding length > 1 and ctype != C. - * Some operating systems fail with multi-byte encodings and a C locale. - * Also, for a C locale there is no need to process as multibyte. From - * backend/utils/adt/oracle_compat.c Teodor - */ - if (pg_database_encoding_max_length() > 1 && !database_ctype_is_c) - { - wchar_t *wstr, - *wptr; - int wlen; - - /* - * alloc number of wchar_t for worst case, len contains number of - * bytes >= number of characters and alloc 1 wchar_t for 0, because - * wchar2char wants zero-terminated string - */ - wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1)); - - wlen = char2wchar(wstr, len + 1, str, len, mylocale); - Assert(wlen <= len); - - while (*wptr) - { - *wptr = towlower((wint_t) *wptr); - wptr++; - } - - /* - * Alloc result string for worst case + '\0' - */ - len = pg_database_encoding_max_length() * wlen + 1; - out = (char *) palloc(len); - - wlen = wchar2char(out, wstr, len, mylocale); - - pfree(wstr); - - if (wlen < 0) - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("conversion from wchar_t to server encoding failed: %m"))); - Assert(wlen < len); - } - else - { - const char *ptr = str; - char *outptr; - - outptr = out = (char *) palloc(sizeof(char) * (len + 1)); - while ((ptr - str) < len && *ptr) - { - *outptr++ = tolower(TOUCHAR(ptr)); - ptr++; - } - *outptr = '\0'; - } - - return out; -} diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c index f20e61d4c8c..89d5ce4ca85 100644 --- a/src/backend/tsearch/ts_utils.c +++ 
b/src/backend/tsearch/ts_utils.c @@ -16,6 +16,7 @@ #include +#include "catalog/pg_collation_d.h" #include "miscadmin.h" #include "tsearch/ts_locale.h" #include "tsearch/ts_public.h" @@ -65,7 +66,7 @@ get_tsearch_config_filename(const char *basename, * or palloc a new version. */ void -readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *)) +readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *, size_t, Oid)) { char **stop = NULL; @@ -115,7 +116,7 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *)) if (wordop) { - stop[s->len] = wordop(line); + stop[s->len] = wordop(line, strlen(line), DEFAULT_COLLATION_OID); if (stop[s->len] != line) pfree(line); } diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h index 71e1f78fa36..38b1a1ba90e 100644 --- a/src/include/tsearch/ts_locale.h +++ b/src/include/tsearch/ts_locale.h @@ -42,9 +42,6 @@ typedef struct extern int t_isalpha(const char *ptr); extern int t_isalnum(const char *ptr); -extern char *lowerstr(const char *str); -extern char *lowerstr_with_len(const char *str, int len); - extern bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename); extern char *tsearch_readline(tsearch_readline_state *stp); diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h index e1549863a12..959bbcc00af 100644 --- a/src/include/tsearch/ts_public.h +++ b/src/include/tsearch/ts_public.h @@ -104,7 +104,7 @@ typedef struct } StopList; extern void readstoplist(const char *fname, StopList *s, - char *(*wordop) (const char *)); + char *(*wordop) (const char *, size_t, Oid)); extern bool searchstoplist(StopList *s, char *key); /* -- 2.47.1