Index: doc/src/sgml/syntax.sgml
===================================================================
RCS file: /cvsroot/pgsql/doc/src/sgml/syntax.sgml,v
retrieving revision 1.123
diff -u -3 -p -c -r1.123 syntax.sgml
*** doc/src/sgml/syntax.sgml 26 Jun 2008 22:24:42 -0000 1.123
--- doc/src/sgml/syntax.sgml 27 Oct 2008 16:54:26 -0000
*************** UPDATE "my_table" SET "a" = 5;
*** 190,195 ****
--- 190,247 ----
+ A variant of quoted identifiers allows including escaped Unicode
+ characters identified by their code points. This variant starts
+ with U& (upper or lower case U followed by
+ ampersand) immediately before the opening double quote, without
+ any spaces in between, for example U&"foo".
+ (Note that this creates an ambiguity with the
+ operator &. Use spaces around the operator to
+ avoid this problem.) Inside the quotes, Unicode characters can be
+ specified in escaped form by writing a backslash followed by the
+ four-digit hexadecimal code point number or alternatively a
+ backslash followed by a plus sign followed by a six-digt
+ hexadecimal code point number. For example, the
+ identifier "data" could be written as
+
+ U&"d\0061t\0061"
+
+ or equivalently
+
+ U&"d\+000061t\+000061"
+
+ The following less trivial example writes the Russian
+ word slon
(elephant) in Cyrillic letters:
+
+ U&"\0441\043B\043E\043D"
+
+
+
+
+ If a different escape character than backslash is desired, it can
+ be specified using the UESCAPE clause after the
+ string, for example:
+
+ U&"d!0061t!0061" UESCAPE '!'
+
+ The escape character can be any single character other than a
+ hexadecimal digit, the plus sign, a single quote, a double quote,
+ or a whitespace character. Note that the escape character is
+ written in single quotes, not double quotes.
+
+
+
+ To include the escape character in the identifier literally, write
+ it twice.
+
+
+
+ The Unicode escape syntax works only when the server encoding is
+ UTF8. When other server encodings are used, only code points in
+ the ASCII range (up to \007F) can be specified.
+
+
+
Quoting an identifier also makes it case-sensitive, whereas
unquoted names are always folded to lower case. For example, the
identifiers FOO, foo, and
*************** UPDATE "my_table" SET "a" = 5;
*** 245,251 ****
write two adjacent single quotes, e.g.
'Dianne''s horse'.
Note that this is not> the same as a double-quote
! character (">).
--- 297,303 ----
write two adjacent single quotes, e.g.
'Dianne''s horse'.
Note that this is not> the same as a double-quote
! character (">).
*************** SELECT 'foo' 'bar';
*** 269,282 ****
by SQL; PostgreSQL is
following the standard.)
-
escape string syntax
backslash escapes
PostgreSQL also accepts escape>
string constants, which are an extension to the SQL standard.
An escape string constant is specified by writing the letter
--- 321,339 ----
by SQL; PostgreSQL is
following the standard.)
+
+
+
+ String Constants with C-Style Escapes
escape string syntax
backslash escapes
+
+
PostgreSQL also accepts escape>
string constants, which are an extension to the SQL standard.
An escape string constant is specified by writing the letter
*************** SELECT 'foo' 'bar';
*** 287,293 ****
Within an escape string, a backslash character (\>) begins a
C-like backslash escape> sequence, in which the combination
of backslash and following character(s) represent a special byte
! value:
Backslash Escape Sequences
--- 344,351 ----
Within an escape string, a backslash character (\>) begins a
C-like backslash escape> sequence, in which the combination
of backslash and following character(s) represent a special byte
! value, shown in
!
Backslash Escape Sequences
*************** SELECT 'foo' 'bar';
*** 341,354 ****
! It is your responsibility that the byte sequences you create are
! valid characters in the server character set encoding. Any other
character following a backslash is taken literally. Thus, to
include a backslash character, write two backslashes (\\>).
Also, a single quote can be included in an escape string by writing
\', in addition to the normal way of ''>.
If the configuration parameter
--- 399,422 ----
!
! Any other
character following a backslash is taken literally. Thus, to
include a backslash character, write two backslashes (\\>).
Also, a single quote can be included in an escape string by writing
\', in addition to the normal way of ''>.
+
+ It is your responsibility that the byte sequences you create are
+ valid characters in the server character set encoding. When the
+ server encoding is UTF-8, then the alternative Unicode escape
+ syntax, explained in ,
+ should be used instead. (The alternative would be doing the
+ UTF-8 encoding by hand and writing out the bytes, which would be
+ very cumbersome.)
+
+
If the configuration parameter
*************** SELECT 'foo' 'bar';
*** 379,384 ****
--- 447,509 ----
+
+ String Constants with Unicode Escapes
+
+
+ PostgreSQL also supports another type
+ of escape syntax for strings that allows specifying arbitrary
+ Unicode characters by code point. A Unicode escape string
+ constant starts with U& (upper or lower case
+ letter U followed by ampersand) immediately before the opening
+ quote, without any spaces in between, for
+ example U&'foo'. (Note that this creates an
+ ambiguity with the operator &. Use spaces
+ around the operator to avoid this problem.) Inside the quotes,
+ Unicode characters can be specified in escaped form by writing a
+ backslash followed by the four-digit hexadecimal code point
+ number or alternatively a backslash followed by a plus sign
+ followed by a six-digt hexadecimal code point number. For
+ example, the string 'data' could be written as
+
+ U&'d\0061t\0061'
+
+ or equivalently
+
+ U&'d\+000061t\+000061'
+
+ The following less trivial example writes the Russian
+ word slon
(elephant) in Cyrillic letters:
+
+ U&'\0441\043B\043E\043D'
+
+
+
+
+ If a different escape character than backslash is desired, it can
+ be specified using the UESCAPE clause after
+ the string, for example:
+
+ U&'d!0061t!0061' UESCAPE '!'
+
+ The escape character can be any single character other than a
+ hexadecimal digit, the plus sign, a single quote, a double quote,
+ or a whitespace character.
+
+
+
+ The Unicode escape syntax works only when the server encoding is
+ UTF8. When other server encodings are used, only code points in
+ the ASCII range (up to \007F) can be
+ specified.
+
+
+
+ To include the escape character in the string literally, write it
+ twice.
+
+
+
Dollar-Quoted String Constants
Index: src/backend/parser/scan.l
===================================================================
RCS file: /cvsroot/pgsql/src/backend/parser/scan.l,v
retrieving revision 1.146
diff -u -3 -p -c -r1.146 scan.l
*** src/backend/parser/scan.l 1 Sep 2008 20:42:45 -0000 1.146
--- src/backend/parser/scan.l 27 Oct 2008 16:54:27 -0000
*************** static int literalalloc; /* current all
*** 76,81 ****
--- 76,82 ----
static void addlit(char *ytext, int yleng);
static void addlitchar(unsigned char ychar);
static char *litbufdup(void);
+ static char *litbuf_udeescape(unsigned char escape);
#define lexer_errposition() scanner_errposition(yylloc)
*************** static unsigned char unescape_single_cha
*** 125,130 ****
--- 126,133 ----
* standard quoted strings
* extended quoted strings (support backslash escape sequences)
* $foo$ quoted strings
+ * quoted identifier with Unicode escapes
+ * quoted string with Unicode escapes
*/
%x xb
*************** static unsigned char unescape_single_cha
*** 134,139 ****
--- 137,144 ----
%x xe
%x xq
%x xdolq
+ %x xui
+ %x xus
/*
* In order to make the world safe for Windows and Mac clients as well as
*************** xdstop {dquote}
*** 244,249 ****
--- 249,273 ----
xddouble {dquote}{dquote}
xdinside [^"]+
+ /* Unicode escapes */
+ uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+ /* error rule to avoid backup */
+ uescapefail ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
+
+ /* Quoted identifier with Unicode escapes */
+ xuistart [uU]&{dquote}
+ xuistop1 {dquote}{whitespace}*{uescapefail}?
+ xuistop2 {dquote}{whitespace}*{uescape}
+
+ /* Quoted string with Unicode escapes */
+ xusstart [uU]&{quote}
+ xusstop1 {quote}{whitespace}*{uescapefail}?
+ xusstop2 {quote}{whitespace}*{uescape}
+
+ /* error rule to avoid backup */
+ xufailed [uU]&
+
+
/* C-style comments
*
* The "extended comment" syntax closely resembles allowable operator syntax.
*************** other .
*** 444,449 ****
--- 468,478 ----
BEGIN(xe);
startlit();
}
+ {xusstart} {
+ SET_YYLLOC();
+ BEGIN(xus);
+ startlit();
+ }
{quotestop} |
{quotefail} {
yyless(1);
*************** other .
*** 456,465 ****
yylval.str = litbufdup();
return SCONST;
}
! {xqdouble} {
addlitchar('\'');
}
! {xqinside} {
addlit(yytext, yyleng);
}
{xeinside} {
--- 485,506 ----
yylval.str = litbufdup();
return SCONST;
}
! {xusstop1} {
! /* throw back all but the quote */
! yyless(1);
! BEGIN(INITIAL);
! yylval.str = litbuf_udeescape('\\');
! return SCONST;
! }
! {xusstop2} {
! BEGIN(INITIAL);
! yylval.str = litbuf_udeescape(yytext[yyleng-2]);
! return SCONST;
! }
! {xqdouble} {
addlitchar('\'');
}
! {xqinside} {
addlit(yytext, yyleng);
}
{xeinside} {
*************** other .
*** 496,509 ****
if (IS_HIGHBIT_SET(c))
saw_high_bit = true;
}
! {quotecontinue} {
/* ignore */
}
. {
/* This is only needed for \ just before EOF */
addlitchar(yytext[0]);
}
! <> { yyerror("unterminated quoted string"); }
{dolqdelim} {
SET_YYLLOC();
--- 537,550 ----
if (IS_HIGHBIT_SET(c))
saw_high_bit = true;
}
! {quotecontinue} {
/* ignore */
}
. {
/* This is only needed for \ just before EOF */
addlitchar(yytext[0]);
}
! <> { yyerror("unterminated quoted string"); }
{dolqdelim} {
SET_YYLLOC();
*************** other .
*** 553,558 ****
--- 594,604 ----
BEGIN(xd);
startlit();
}
+ {xuistart} {
+ SET_YYLLOC();
+ BEGIN(xui);
+ startlit();
+ }
{xdstop} {
char *ident;
*************** other .
*** 565,577 ****
yylval.str = ident;
return IDENT;
}
! {xddouble} {
addlitchar('"');
}
! {xdinside} {
addlit(yytext, yyleng);
}
! <> { yyerror("unterminated quoted identifier"); }
{typecast} {
SET_YYLLOC();
--- 611,656 ----
yylval.str = ident;
return IDENT;
}
! {xuistop1} {
! char *ident;
!
! BEGIN(INITIAL);
! if (literallen == 0)
! yyerror("zero-length delimited identifier");
! ident = litbuf_udeescape('\\');
! if (literallen >= NAMEDATALEN)
! truncate_identifier(ident, literallen, true);
! yylval.str = ident;
! /* throw back all but the quote */
! yyless(1);
! return IDENT;
! }
! {xuistop2} {
! char *ident;
!
! BEGIN(INITIAL);
! if (literallen == 0)
! yyerror("zero-length delimited identifier");
! ident = litbuf_udeescape(yytext[yyleng - 2]);
! if (literallen >= NAMEDATALEN)
! truncate_identifier(ident, literallen, true);
! yylval.str = ident;
! return IDENT;
! }
! {xddouble} {
addlitchar('"');
}
! {xdinside} {
addlit(yytext, yyleng);
}
! <> { yyerror("unterminated quoted identifier"); }
!
! {xufailed} {
! /* throw back all but the initial u/U */
! yyless(1);
! /* and treat it as {other} */
! return yytext[0];
! }
{typecast} {
SET_YYLLOC();
*************** litbufdup(void)
*** 908,913 ****
--- 987,1082 ----
return new;
}
+ static int
+ hexval(unsigned char c)
+ {
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ if (c >= 'a' && c <= 'f')
+ return c - 'a' + 0xA;
+ if (c >= 'A' && c <= 'F')
+ return c - 'A' + 0xA;
+ elog(ERROR, "invalid hexadecimal digit");
+ return 0; /* not reached */
+ }
+
+ static void
+ check_unicode_value(pg_wchar c, char * loc)
+ {
+ if (GetDatabaseEncoding() == PG_UTF8)
+ return;
+
+ if (c > 0x7F)
+ {
+ yylloc += (char *) loc - literalbuf + 3; /* 3 for U&" */
+ yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+ }
+ }
+
+ static char *
+ litbuf_udeescape(unsigned char escape)
+ {
+ char *new;
+ char *in, *out;
+
+ if (isxdigit(escape)
+ || escape == '+'
+ || escape == '\''
+ || escape == '"'
+ || scanner_isspace(escape))
+ yyerror("invalid Unicode escape character");
+
+ /*
+ * This relies on the subtle assumption that a UTF-8 expansion
+ * cannot be longer than its escaped representation.
+ */
+ new = palloc(literallen + 1);
+
+ in = literalbuf;
+ out = new;
+ while (*in)
+ {
+ if (in[0] == escape)
+ {
+ if (in[1] == escape)
+ {
+ *out++ = escape;
+ in += 2;
+ }
+ else if (isxdigit(in[1]) && isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[4]))
+ {
+ pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
+ check_unicode_value(unicode, in);
+ unicode_to_utf8(unicode, (unsigned char *) out);
+ in += 5;
+ out += pg_mblen(out);
+ }
+ else if (in[1] == '+'
+ && isxdigit(in[2]) && isxdigit(in[3])
+ && isxdigit(in[4]) && isxdigit(in[5])
+ && isxdigit(in[6]) && isxdigit(in[7]))
+ {
+ pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
+ + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
+ check_unicode_value(unicode, in);
+ unicode_to_utf8(unicode, (unsigned char *) out);
+ in += 8;
+ out += pg_mblen(out);
+ }
+ else
+ {
+ yylloc += in - literalbuf + 3; /* 3 for U&" */
+ yyerror("invalid Unicode escape value");
+ }
+ }
+ else
+ *out++ = *in++;
+ }
+
+ *out = '\0';
+ pg_verifymbstr(new, out - new, false);
+ return new;
+ }
static unsigned char
unescape_single_char(unsigned char c)
Index: src/backend/utils/adt/xml.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/xml.c,v
retrieving revision 1.79
diff -u -3 -p -c -r1.79 xml.c
*** src/backend/utils/adt/xml.c 14 Oct 2008 17:12:33 -0000 1.79
--- src/backend/utils/adt/xml.c 27 Oct 2008 16:54:27 -0000
*************** unicode_to_sqlchar(pg_wchar c)
*** 1497,1524 ****
{
static unsigned char utf8string[5]; /* need trailing zero */
! if (c <= 0x7F)
! {
! utf8string[0] = c;
! }
! else if (c <= 0x7FF)
! {
! utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
! utf8string[1] = 0x80 | (c & 0x3F);
! }
! else if (c <= 0xFFFF)
! {
! utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
! utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
! utf8string[2] = 0x80 | (c & 0x3F);
! }
! else
! {
! utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
! utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
! utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
! utf8string[3] = 0x80 | (c & 0x3F);
! }
return (char *) pg_do_encoding_conversion(utf8string,
pg_mblen((char *) utf8string),
--- 1497,1503 ----
{
static unsigned char utf8string[5]; /* need trailing zero */
! unicode_to_utf8(c, utf8string);
return (char *) pg_do_encoding_conversion(utf8string,
pg_mblen((char *) utf8string),
Index: src/backend/utils/mb/wchar.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
retrieving revision 1.66
diff -u -3 -p -c -r1.66 wchar.c
*** src/backend/utils/mb/wchar.c 15 Nov 2007 21:14:40 -0000 1.66
--- src/backend/utils/mb/wchar.c 27 Oct 2008 16:54:27 -0000
*************** pg_utf2wchar_with_len(const unsigned cha
*** 419,424 ****
--- 419,459 ----
return cnt;
}
+
+ /*
+ * Map a Unicode codepoint to UTF-8. utf8string must have 4 bytes of
+ * space allocated.
+ */
+ unsigned char *
+ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+ {
+ if (c <= 0x7F)
+ {
+ utf8string[0] = c;
+ }
+ else if (c <= 0x7FF)
+ {
+ utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
+ utf8string[1] = 0x80 | (c & 0x3F);
+ }
+ else if (c <= 0xFFFF)
+ {
+ utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
+ utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
+ utf8string[2] = 0x80 | (c & 0x3F);
+ }
+ else
+ {
+ utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
+ utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
+ utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
+ utf8string[3] = 0x80 | (c & 0x3F);
+ }
+
+ return utf8string;
+ }
+
+
/*
* Return the byte length of a UTF8 character pointed to by s
*
Index: src/include/mb/pg_wchar.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/mb/pg_wchar.h,v
retrieving revision 1.79
diff -u -3 -p -c -r1.79 pg_wchar.h
*** src/include/mb/pg_wchar.h 18 Jun 2008 18:42:54 -0000 1.79
--- src/include/mb/pg_wchar.h 27 Oct 2008 16:54:27 -0000
*************** extern const char *GetDatabaseEncodingNa
*** 380,385 ****
--- 380,386 ----
extern int pg_valid_client_encoding(const char *name);
extern int pg_valid_server_encoding(const char *name);
+ extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
extern int pg_utf_mblen(const unsigned char *);
extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
int src_encoding,