From b0d42407a60e116d3ccb0ed04505aa362f8a6a1d Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 14 Dec 2022 10:15:03 +0100 Subject: [PATCH] Allow tailoring of ICU locales with custom rules This exposes the ICU facility to add custom collation rules to a standard collation. --- doc/src/sgml/catalogs.sgml | 18 +++++++ doc/src/sgml/ref/create_collation.sgml | 22 +++++++++ doc/src/sgml/ref/create_database.sgml | 12 +++++ src/backend/catalog/pg_collation.c | 5 ++ src/backend/commands/collationcmds.c | 21 ++++++-- src/backend/commands/dbcommands.c | 49 +++++++++++++++++-- src/backend/utils/adt/pg_locale.c | 41 +++++++++++++++- src/backend/utils/init/postinit.c | 11 ++++- src/include/catalog/pg_collation.h | 2 + src/include/catalog/pg_database.h | 3 ++ src/include/utils/pg_locale.h | 1 + .../regress/expected/collate.icu.utf8.out | 30 ++++++++++++ src/test/regress/sql/collate.icu.utf8.sql | 13 +++++ 13 files changed, 219 insertions(+), 9 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 9316b811ac..afa9f28ef9 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -2428,6 +2428,15 @@ <structname>pg_collation</structname> Columns + + + collicurules text + + + ICU collation rules for this collation object + + + collversion text @@ -3106,6 +3115,15 @@ <structname>pg_database</structname> Columns + + + daticurules text + + + ICU collation rules for this database + + + datcollversion text diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 58f5f0cd63..2c7266107e 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -27,6 +27,7 @@ [ LC_CTYPE = lc_ctype, ] [ PROVIDER = provider, ] [ DETERMINISTIC = boolean, ] + [ RULES = rules, ] [ VERSION = version ] ) CREATE COLLATION [ IF NOT EXISTS ] name FROM existing_collation @@ -149,6 +150,19 @@ Parameters + + rules + + + + Specifies additional collation rules to customize the behavior of the + collation. This is supported for ICU only. See + for details on the syntax. + + + + version @@ -228,6 +242,14 @@ Examples + + To create a collation using the ICU provider, based on the English ICU + locale, with custom rules: + + + + + To create a collation from an existing collation: diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index ea38c64731..aa6f121a81 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -192,6 +192,18 @@ Parameters + + icu_rules + + + Specifies additional collation rules to customize the behavior of the + collation. This is supported for ICU only. See + for details on the syntax. + + + + locale_provider diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c index aa8fbe1a98..7ed9de3891 100644 --- a/src/backend/catalog/pg_collation.c +++ b/src/backend/catalog/pg_collation.c @@ -50,6 +50,7 @@ CollationCreate(const char *collname, Oid collnamespace, int32 collencoding, const char *collcollate, const char *collctype, const char *colliculocale, + const char *collicurules, const char *collversion, bool if_not_exists, bool quiet) @@ -194,6 +195,10 @@ CollationCreate(const char *collname, Oid collnamespace, values[Anum_pg_collation_colliculocale - 1] = CStringGetTextDatum(colliculocale); else nulls[Anum_pg_collation_colliculocale - 1] = true; + if (collicurules) + values[Anum_pg_collation_collicurules - 1] = CStringGetTextDatum(collicurules); + else + nulls[Anum_pg_collation_collicurules - 1] = true; if (collversion) values[Anum_pg_collation_collversion - 1] = CStringGetTextDatum(collversion); else diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 81e54e0ce6..50f16f2764 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -64,10 +64,12 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e DefElem *lcctypeEl = NULL; DefElem *providerEl = NULL; DefElem *deterministicEl = NULL; + DefElem *rulesEl = NULL; DefElem *versionEl = NULL; char *collcollate; char *collctype; char *colliculocale; + char *collicurules; bool collisdeterministic; int collencoding; char collprovider; @@ -99,6 +101,8 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e defelp = &providerEl; else if (strcmp(defel->defname, "deterministic") == 0) defelp = &deterministicEl; + else if (strcmp(defel->defname, "rules") == 0) + defelp = &rulesEl; else if (strcmp(defel->defname, "version") == 0) defelp = &versionEl; else @@ -161,6 +165,12 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e else colliculocale = NULL; + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull); + if (!isnull) + collicurules = TextDatumGetCString(datum); + else + collicurules = NULL; + ReleaseSysCache(tp); /* @@ -182,6 +192,7 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e collcollate = NULL; collctype = NULL; colliculocale = NULL; + collicurules = NULL; if (providerEl) collproviderstr = defGetString(providerEl); @@ -191,6 +202,9 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e else collisdeterministic = true; + if (rulesEl) + collicurules = defGetString(rulesEl); + if (versionEl) collversion = defGetString(versionEl); @@ -297,6 +311,7 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e collcollate, collctype, colliculocale, + collicurules, collversion, if_not_exists, false); /* not quiet */ @@ -710,7 +725,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS) */ collid = CollationCreate(localebuf, nspid, GetUserId(), COLLPROVIDER_LIBC, true, enc, - localebuf, localebuf, NULL, + localebuf, localebuf, NULL, NULL, get_collation_actual_version(COLLPROVIDER_LIBC, localebuf), true, true); if (OidIsValid(collid)) @@ -777,7 +792,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS) collid = CollationCreate(alias, nspid, GetUserId(), COLLPROVIDER_LIBC, true, enc, - locale, locale, NULL, + locale, locale, NULL, NULL, get_collation_actual_version(COLLPROVIDER_LIBC, locale), true, true); if (OidIsValid(collid)) @@ -839,7 +854,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS) collid = CollationCreate(psprintf("%s-x-icu", langtag), nspid, GetUserId(), COLLPROVIDER_ICU, true, -1, - NULL, NULL, iculocstr, + NULL, NULL, iculocstr, NULL, get_collation_actual_version(COLLPROVIDER_ICU, iculocstr), true, true); if (OidIsValid(collid)) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 6eb8742718..65d6ad46ef 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -119,6 +119,7 @@ static bool get_db_info(const char *name, LOCKMODE lockmode, int *encodingP, bool *dbIsTemplateP, bool *dbAllowConnP, TransactionId *dbFrozenXidP, MultiXactId *dbMinMultiP, Oid *dbTablespace, char **dbCollate, char **dbCtype, char **dbIculocale, + char **dbIcurules, char *dbLocProvider, char **dbCollversion); static bool have_createdb_privilege(void); @@ -676,6 +677,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) char *src_collate = NULL; char *src_ctype = NULL; char *src_iculocale = NULL; + char *src_icurules = NULL; char src_locprovider = '\0'; char *src_collversion = NULL; bool src_istemplate; @@ -699,6 +701,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) DefElem *dcollate = NULL; DefElem *dctype = NULL; DefElem *diculocale = NULL; + DefElem *dicurules = NULL; DefElem *dlocprovider = NULL; DefElem *distemplate = NULL; DefElem *dallowconnections = NULL; @@ -711,6 +714,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) char *dbcollate = NULL; char *dbctype = NULL; char *dbiculocale = NULL; + char *dbicurules = NULL; char dblocprovider = '\0'; char *canonname; int encoding = -1; @@ -776,6 +780,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errorConflictingDefElem(defel, pstate); diculocale = defel; } + else if (strcmp(defel->defname, "icu_rules") == 0) + { + if (dicurules) + errorConflictingDefElem(defel, pstate); + dicurules = defel; + } else if (strcmp(defel->defname, "locale_provider") == 0) { if (dlocprovider) @@ -959,7 +969,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) &src_dboid, &src_owner, &src_encoding, &src_istemplate, &src_allowconn, &src_frozenxid, &src_minmxid, &src_deftablespace, - &src_collate, &src_ctype, &src_iculocale, &src_locprovider, + &src_collate, &src_ctype, &src_iculocale, &src_icurules, &src_locprovider, &src_collversion)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), @@ -1007,6 +1017,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dblocprovider = src_locprovider; if (dbiculocale == NULL && dblocprovider == COLLPROVIDER_ICU) dbiculocale = src_iculocale; + if (dbicurules == NULL && dblocprovider == COLLPROVIDER_ICU) + dbicurules = src_icurules; /* Some encodings are client only */ if (!PG_VALID_BE_ENCODING(encoding)) @@ -1098,6 +1110,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if (dblocprovider == COLLPROVIDER_ICU) { + char *val1; + char *val2; + Assert(dbiculocale); Assert(src_iculocale); if (strcmp(dbiculocale, src_iculocale) != 0) @@ -1106,6 +1121,19 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errmsg("new ICU locale (%s) is incompatible with the ICU locale of the template database (%s)", dbiculocale, src_iculocale), errhint("Use the same ICU locale as in the template database, or use template0 as template."))); + + val1 = dbicurules; + if (!val1) + val1 = ""; + val2 = src_icurules; + if (!val2) + val2 = ""; + if (strcmp(val1, val2) != 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("new ICU collation rules (%s) are incompatible with the ICU collation rules of the template database (%s)", + val1, val2), + errhint("Use the same ICU collation rules as in the template database, or use template0 as template."))); } } @@ -1314,6 +1342,10 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) new_record[Anum_pg_database_daticulocale - 1] = CStringGetTextDatum(dbiculocale); else new_record_nulls[Anum_pg_database_daticulocale - 1] = true; + if (dbicurules) + new_record[Anum_pg_database_daticurules - 1] = CStringGetTextDatum(dbicurules); + else + new_record_nulls[Anum_pg_database_daticurules - 1] = true; if (dbcollversion) new_record[Anum_pg_database_datcollversion - 1] = CStringGetTextDatum(dbcollversion); else @@ -1527,7 +1559,7 @@ dropdb(const char *dbname, bool missing_ok, bool force) pgdbrel = table_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, - &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) + &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) { if (!missing_ok) { @@ -1727,7 +1759,7 @@ RenameDatabase(const char *oldname, const char *newname) rel = table_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(oldname, AccessExclusiveLock, &db_id, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", oldname))); @@ -1837,7 +1869,7 @@ movedb(const char *dbname, const char *tblspcname) pgdbrel = table_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, - NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL, NULL, NULL, NULL)) + NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL, NULL, NULL, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname))); @@ -2600,6 +2632,7 @@ get_db_info(const char *name, LOCKMODE lockmode, int *encodingP, bool *dbIsTemplateP, bool *dbAllowConnP, TransactionId *dbFrozenXidP, MultiXactId *dbMinMultiP, Oid *dbTablespace, char **dbCollate, char **dbCtype, char **dbIculocale, + char **dbIcurules, char *dbLocProvider, char **dbCollversion) { @@ -2716,6 +2749,14 @@ get_db_info(const char *name, LOCKMODE lockmode, else *dbIculocale = TextDatumGetCString(datum); } + if (dbIcurules) + { + datum = SysCacheGetAttr(DATABASEOID, tuple, Anum_pg_database_daticurules, &isnull); + if (isnull) + *dbIcurules = NULL; + else + *dbIcurules = TextDatumGetCString(datum); + } if (dbCollversion) { datum = SysCacheGetAttr(DATABASEOID, tuple, Anum_pg_database_datcollversion, &isnull); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 2b42d9ccd8..191d2e8a68 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -69,6 +69,7 @@ #ifdef USE_ICU #include +#include #endif #ifdef __GLIBC__ @@ -1402,6 +1403,7 @@ struct pg_locale_struct default_locale; void make_icu_collator(const char *iculocstr, + const char *icurules, struct pg_locale_struct *resultp) { #ifdef USE_ICU @@ -1418,6 +1420,35 @@ make_icu_collator(const char *iculocstr, if (U_ICU_VERSION_MAJOR_NUM < 54) icu_set_collation_attributes(collator, iculocstr); + /* + * If rules are specified, we extract the rules of the standard collation, + * add our own rules, and make a new collator with the combined rules. + */ + if (icurules) + { + const UChar *default_rules; + UChar *agg_rules; + UChar *my_rules; + int32_t length; + + default_rules = ucol_getRules(collator, &length); + icu_to_uchar(&my_rules, icurules, strlen(icurules)); + + agg_rules = palloc_array(UChar, u_strlen(default_rules) + u_strlen(my_rules) + 1); + u_strcpy(agg_rules, default_rules); + u_strcat(agg_rules, my_rules); + + ucol_close(collator); + + status = U_ZERO_ERROR; + collator = ucol_openRules(agg_rules, u_strlen(agg_rules), + UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, NULL, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s", + iculocstr, icurules, u_errorName(status)))); + } + /* We will leak this string if the caller errors later :-( */ resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr); resultp->info.icu.ucol = collator; @@ -1580,11 +1611,19 @@ pg_newlocale_from_collation(Oid collid) else if (collform->collprovider == COLLPROVIDER_ICU) { const char *iculocstr; + const char *icurules; datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_colliculocale, &isnull); Assert(!isnull); iculocstr = TextDatumGetCString(datum); - make_icu_collator(iculocstr, &result); + + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull); + if (!isnull) + icurules = TextDatumGetCString(datum); + else + icurules = NULL; + + make_icu_collator(iculocstr, icurules, &result); } datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index a990c833c5..55a035c062 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -421,10 +421,19 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect if (dbform->datlocprovider == COLLPROVIDER_ICU) { + char *icurules; + datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticulocale, &isnull); Assert(!isnull); iculocale = TextDatumGetCString(datum); - make_icu_collator(iculocale, &default_locale); + + datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticurules, &isnull); + if (!isnull) + icurules = TextDatumGetCString(datum); + else + icurules = NULL; + + make_icu_collator(iculocale, icurules, &default_locale); } else iculocale = NULL; diff --git a/src/include/catalog/pg_collation.h b/src/include/catalog/pg_collation.h index 2190ccb5b8..ad2d767d65 100644 --- a/src/include/catalog/pg_collation.h +++ b/src/include/catalog/pg_collation.h @@ -43,6 +43,7 @@ CATALOG(pg_collation,3456,CollationRelationId) text collcollate BKI_DEFAULT(_null_); /* LC_COLLATE setting */ text collctype BKI_DEFAULT(_null_); /* LC_CTYPE setting */ text colliculocale BKI_DEFAULT(_null_); /* ICU locale ID */ + text collicurules BKI_DEFAULT(_null_); /* ICU collation rules */ text collversion BKI_DEFAULT(_null_); /* provider-dependent * version of collation * data */ @@ -91,6 +92,7 @@ extern Oid CollationCreate(const char *collname, Oid collnamespace, int32 collencoding, const char *collcollate, const char *collctype, const char *colliculocale, + const char *collicurules, const char *collversion, bool if_not_exists, bool quiet); diff --git a/src/include/catalog/pg_database.h b/src/include/catalog/pg_database.h index 611c95656a..48bc285863 100644 --- a/src/include/catalog/pg_database.h +++ b/src/include/catalog/pg_database.h @@ -71,6 +71,9 @@ CATALOG(pg_database,1262,DatabaseRelationId) BKI_SHARED_RELATION BKI_ROWTYPE_OID /* ICU locale ID */ text daticulocale; + /* ICU collation rules */ + text daticurules BKI_DEFAULT(_null_); + /* provider-dependent version of collation data */ text datcollversion BKI_DEFAULT(_null_); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index a875942123..989fe26cef 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -95,6 +95,7 @@ typedef struct pg_locale_struct *pg_locale_t; extern PGDLLIMPORT struct pg_locale_struct default_locale; extern void make_icu_collator(const char *iculocstr, + const char *icurules, struct pg_locale_struct *resultp); extern pg_locale_t pg_newlocale_from_collation(Oid collid); diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index d4c8c6de38..ce9ed3c8bf 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1190,6 +1190,36 @@ SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE tes t | t (1 row) +-- rules +CREATE COLLATION testcoll_rules1 (provider = icu, locale = '', rules = '&a < g'); +CREATE TABLE test7 (a text); +-- example from https://unicode-org.github.io/icu/userguide/collation/customization/#syntax +INSERT INTO test7 VALUES ('Abernathy'), ('apple'), ('bird'), ('Boston'), ('Graham'), ('green'); +SELECT * FROM test7 ORDER BY a COLLATE "en-x-icu"; + a +----------- + Abernathy + apple + bird + Boston + Graham + green +(6 rows) + +SELECT * FROM test7 ORDER BY a COLLATE testcoll_rules1; + a +----------- + Abernathy + apple + green + bird + Boston + Graham +(6 rows) + +DROP TABLE test7; +CREATE COLLATION testcoll_rulesx (provider = icu, locale = '', rules = '!!wrong!!'); +ERROR: could not open collator for locale "" with rules "!!wrong!!": U_INVALID_FORMAT_ERROR -- nondeterministic collations CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true); CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false); diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index b0ddc7db44..aa95c1ec42 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -472,6 +472,19 @@ CREATE COLLATION testcoll_de_phonebook (provider = icu, locale = 'de@collation=p SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE testcoll_de_phonebook; +-- rules + +CREATE COLLATION testcoll_rules1 (provider = icu, locale = '', rules = '&a < g'); +CREATE TABLE test7 (a text); +-- example from https://unicode-org.github.io/icu/userguide/collation/customization/#syntax +INSERT INTO test7 VALUES ('Abernathy'), ('apple'), ('bird'), ('Boston'), ('Graham'), ('green'); +SELECT * FROM test7 ORDER BY a COLLATE "en-x-icu"; +SELECT * FROM test7 ORDER BY a COLLATE testcoll_rules1; +DROP TABLE test7; + +CREATE COLLATION testcoll_rulesx (provider = icu, locale = '', rules = '!!wrong!!'); + + -- nondeterministic collations CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true); -- 2.38.1