From 5eac8a0df7163f8374382d37b32b9c2d3580238d Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 22 Sep 2017 13:51:01 -0400 Subject: [PATCH 1/2] Expand collation documentation Document better how to create custom collations and what locale strings ICU accepts. Explain the ICU examples in more detail. Also update the text on the CREATE COLLATION reference page a bit to take ICU more into account. --- doc/src/sgml/charset.sgml | 135 ++++++++++++++++++++++++++------- doc/src/sgml/ref/create_collation.sgml | 28 ++++--- 2 files changed, 124 insertions(+), 39 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 44e43503a6..63f7de5b43 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -515,7 +515,7 @@ Managing Collations A collation object provided by libc maps to a combination of LC_COLLATE and LC_CTYPE - settings. (As + settings, as accepted by the setlocale() system library call. (As the name would suggest, the main purpose of a collation is to set LC_COLLATE, which controls the sort order. But it is rarely necessary in practice to have an @@ -640,21 +640,19 @@ libc collations ICU collations - Collations provided by ICU are created with names in BCP 47 language tag + With ICU, it is not sensible to enumerate all possible locale names. ICU + uses a particular naming system for locales, but there are many more ways + to name a locale than there are actually distinct locales. + initdb uses the ICU APIs to extract a set of distinct + locales to populate the initial set of collations. Collations provided by + ICU are created in the SQL environment with names in BCP 47 language tag format, with a private use extension -x-icu appended, to distinguish them from - libc locales. So de-x-icu would be an example name. + libc locales. - With ICU, it is not sensible to enumerate all possible locale names. 
ICU - uses a particular naming system for locales, but there are many more ways - to name a locale than there are actually distinct locales. (In fact, any - string will be accepted as a locale name.) - See for - information on ICU locale naming. initdb uses the ICU - APIs to extract a set of distinct locales to populate the initial set of - collations. Here are some example collations that might be created: + Here are some example collations that might be created: @@ -695,32 +693,104 @@ ICU collations will draw an error along the lines of collation "de-x-icu" for encoding "WIN874" does not exist. + + + + + Creating New Collation Objects + + + If the standard and predefined collations are not sufficient, users can + create their own collation objects using the SQL + command . + + + + The standard and predefined collations are in the + schema pg_catalog, like all predefined objects. + User-defined collations should be created in user schemas. This also + ensures that they are saved by pg_dump. + + + + libc collations + + + New libc collations can be created like this: + +CREATE COLLATION german (provider = libc, locale = 'de_DE'); + + The exact values that are acceptable for the locale + clause in this command depend on the operating system. On Unix-like + systems, the command locale -a will show a list. + + + + Since the predefined libc collations already include all collations + defined in the operating system when the database instance is + initialized, it is not often necessary to manually create new ones. + Reasons might be if a different naming system is desired (in which case + see also ) or if the operating system has + been upgraded to provide new locale definitions (in which case see + also pg_import_system_collations()). + + + + + ICU collations ICU allows collations to be customized beyond the basic language+country set that is preloaded by initdb. 
Users are encouraged to define their own collation objects that make use of these facilities to - suit the sorting behavior to their requirements. Here are some examples: + suit the sorting behavior to their requirements. + See + and for + information on ICU locale naming. The set of acceptable names and + attributes depends on the particular ICU version. + + + + Here are some examples: - CREATE COLLATION "de-u-co-phonebk-x-icu" (provider = icu, locale = 'de-u-co-phonebk') + CREATE COLLATION "de-u-co-phonebk-x-icu" (provider = icu, locale = 'de-u-co-phonebk'); + CREATE COLLATION "de-u-co-phonebk-x-icu" (provider = icu, locale = 'de@collation=phonebook'); German collation with phone book collation type + + The first example selects the ICU locale using a language + tag per BCP 47. The second example uses the traditional + ICU-specific locale syntax. The first style is preferred going + forward, but it is not supported by older ICU versions. + + + Note that you can name the collation objects in the SQL environment + anything you want. In this example, we follow the naming style that + the predefined collations use, which in turn also follow BCP 47, but + that is not required for user-defined collations. + - CREATE COLLATION "und-u-co-emoji-x-icu" (provider = icu, locale = 'und-u-co-emoji') + CREATE COLLATION "und-u-co-emoji-x-icu" (provider = icu, locale = 'und-u-co-emoji'); + CREATE COLLATION "und-u-co-emoji-x-icu" (provider = icu, locale = '@collation=emoji'); Root collation with Emoji collation type, per Unicode Technical Standard #51 + + Observe how in the traditional ICU locale naming system, the root + locale is selected by an empty string. + - CREATE COLLATION digitslast (provider = icu, locale = 'en-u-kr-latn-digit') + CREATE COLLATION digitslast (provider = icu, locale = 'en-u-kr-latn-digit'); + CREATE COLLATION digitslast (provider = icu, locale = 'en@colReorder=latn-digit'); Sort digits after Latin letters. (The default is digits before letters.) 
@@ -729,7 +799,8 @@ ICU collations - CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper') + CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper'); + CREATE COLLATION upperfirst (provider = icu, locale = 'en@colCaseFirst=upper'); Sort upper-case letters before lower-case letters. (The default is @@ -739,7 +810,8 @@ ICU collations - CREATE COLLATION special (provider = icu, locale = 'en-u-kf-upper-kr-latn-digit') + CREATE COLLATION special (provider = icu, locale = 'en-u-kf-upper-kr-latn-digit'); + CREATE COLLATION special (provider = icu, locale = 'en@colCaseFirst=upper;colReorder=latn-digit'); Combines both of the above options. @@ -748,7 +820,8 @@ ICU collations - CREATE COLLATION numeric (provider = icu, locale = 'en-u-kn-true') + CREATE COLLATION numeric (provider = icu, locale = 'en-u-kn-true'); + CREATE COLLATION numeric (provider = icu, locale = 'en@colNumeric=yes'); Numeric ordering, sorts sequences of digits by their numeric value, @@ -768,7 +841,8 @@ ICU collations repository. The ICU Locale Explorer can be used to check the details of a particular locale - definition. + definition. The examples using the k* subtags require + at least ICU version 54. @@ -779,10 +853,21 @@ ICU collations strings that compare equal according to the collation but are not byte-wise equal will be sorted according to their byte values. + + + + By design, ICU will accept almost any string as a locale name and match + it to the closest locale it can provide, using the fallback procedure + described in its documentation. Thus, there will be no direct feedback + if a collation specification is composed using features that the given + ICU installation does not actually support. It is therefore recommended + to create application-level test cases to check that the collation + definitions satisfy one's requirements. 
+ + - - + Copying Collations @@ -796,13 +881,7 @@ Copying Collations CREATE COLLATION french FROM "fr-x-icu"; - - - The standard and predefined collations are in the - schema pg_catalog, like all predefined objects. - User-defined collations should be created in user schemas. This also - ensures that they are saved by pg_dump. - + diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 2d3e050545..f88758095f 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -93,10 +93,7 @@ Parameters Use the specified operating system locale for - the LC_COLLATE locale category. The locale - must be applicable to the current database encoding. - (See for the precise - rules.) + the LC_COLLATE locale category. @@ -107,10 +104,7 @@ Parameters Use the specified operating system locale for - the LC_CTYPE locale category. The locale - must be applicable to the current database encoding. - (See for the precise - rules.) + the LC_CTYPE locale category. @@ -173,8 +167,13 @@ Notes - See for more information about collation - support in PostgreSQL. + See for more information on how to create collations. + + + + When using the libc collation provider, the locale must + be applicable to the current database encoding. + See for the precise rules. @@ -186,7 +185,14 @@ Examples fr_FR.utf8 (assuming the current database encoding is UTF8): -CREATE COLLATION french (LOCALE = 'fr_FR.utf8'); +CREATE COLLATION french (locale = 'fr_FR.utf8'); + + + + + To create a collation using the ICU provider with the German phone book sort order: + +CREATE COLLATION german_phonebook (provider = icu, locale = 'de-u-co-phonebk'); -- 2.14.1