diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml ./doc/src/sgml/ref/copy.sgml
*** ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml Mon Jan 3 19:39:53 2005
--- ./doc/src/sgml/ref/copy.sgml Sun Feb 20 19:18:54 2005
***************
*** 496,508 ****
CSV mode will both recognize and produce CSV files with quoted
values containing embedded carriage returns and line feeds. Thus
! the files are not strictly one line per table row like text-mode
! files. However, PostgreSQL will reject
! COPY input if any fields contain embedded line
! end character sequences that do not match the line ending
! convention used in the CSV file itself. It is generally safer to
! import data containing embedded line end characters using the
! text or binary formats rather than CSV.
--- 496,503 ----
CSV mode will both recognize and produce CSV files with quoted
values containing embedded carriage returns and line feeds. Thus
! the files are not strictly one line per table row as are text-mode
! files.
***************
*** 513,518 ****
--- 508,515 ----
might encounter some files that cannot be imported using this
mechanism, and COPY might produce files that other
programs cannot process.
+ It is generally safer to import data using the text or binary formats,
+ if possible, rather than using CSV format.
diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c ./src/backend/commands/copy.c
*** ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c Fri Dec 31 16:59:41 2004
--- ./src/backend/commands/copy.c Sun Feb 20 13:40:56 2005
***************
*** 98,104 ****
static EolType eol_type; /* EOL type of input */
static int client_encoding; /* remote side's character encoding */
static int server_encoding; /* local encoding */
- static bool embedded_line_warning;
/* these are just for error messages, see copy_in_error_callback */
static bool copy_binary; /* is it a binary copy? */
--- 98,103 ----
***************
*** 140,145 ****
--- 139,145 ----
char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
List *force_notnull_atts);
static bool CopyReadLine(void);
+ static bool CopyReadLineCSV(char * quote, char * escape);
static char *CopyReadAttribute(const char *delim, const char *null_print,
CopyReadResult *result, bool *isnull);
static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***************
*** 1191,1197 ****
attr = tupDesc->attrs;
num_phys_attrs = tupDesc->natts;
attr_count = list_length(attnumlist);
- embedded_line_warning = false;
/*
* Get info about the columns we need to process.
--- 1191,1196 ----
***************
*** 1718,1724 ****
ListCell *cur;
/* Actually read the line into memory here */
! done = CopyReadLine();
/*
* EOF at start of line means we're done. If we see EOF after
--- 1717,1723 ----
ListCell *cur;
/* Actually read the line into memory here */
! done = csv_mode ? CopyReadLineCSV(quote, escape) : CopyReadLine();
/*
* EOF at start of line means we're done. If we see EOF after
***************
*** 2194,2199 ****
--- 2193,2448 ----
return result;
}
+ /*
+ * Read a line for CSV copy mode. Differences from standard mode:
+ * . CR and NL are not special inside quoted fields - they just get added
+ * to the buffer.
+ * . \ is not magical except as the start of the end of data marker.
+ * Result is true if EOF or the end-of-copy marker was reached, else false.
+ */
+
+ static bool
+ CopyReadLineCSV(char * quote, char * escape)
+ {
+ bool result;
+ bool change_encoding = (client_encoding != server_encoding);
+ int c;
+ int mblen;
+ int j;
+ unsigned char s[2];
+ char *cvt;
+ bool in_quote = false, last_was_esc = false;
+ char quotec = quote[0];
+ char escapec = escape[0];
+
+ s[1] = 0;
+
+ /* no separate escape tracking when escape is the same char as quote */
+ if (quotec == escapec)
+ escapec = '\0';
+
+ /* reset line_buf to empty */
+ line_buf.len = 0;
+ line_buf.data[0] = '\0';
+ line_buf.cursor = 0;
+
+ /* mark that encoding conversion hasn't occurred yet */
+ line_buf_converted = false;
+
+ /* default status: not at EOF; set to true below when input ends */
+ result = false;
+
+ /*
+ * In this loop we only care for detecting newlines (\r and/or \n)
+ * and the end-of-copy marker (\.). These four
+ * characters, and only these four, are assumed the same in frontend
+ * and backend encodings. We do not assume that second and later bytes
+ * of a frontend multibyte character couldn't look like ASCII characters.
+ *
+ * XXX What about the encoding implications of the quote / escape chars?
+ *
+ * However, CR and NL characters that are inside a quoted field are
+ * not special, and are simply a part of the data value. The parsing rule
+ * used is a bit rough and ready, but probably adequate for our purposes.
+ */
+
+ for (;;)
+ {
+ c = CopyGetChar();
+ if (c == EOF)
+ {
+ result = true;
+ break;
+ }
+
+ /*
+ * Dealing with quotes and escapes here is mildly tricky. If the
+ * quote char is also the escape char, there's no problem - we
+ * just use the char as a toggle. If they are different, we need
+ * to ensure that we only take account of an escape inside a quoted
+ * field and immediately preceding a quote char, and not the
+ * second in an escape-escape sequence.
+ */
+
+ if (in_quote && c == escapec)
+ last_was_esc = ! last_was_esc;
+
+ if (c == quotec && ! last_was_esc)
+ in_quote = ! in_quote;
+
+ if (c != escapec)
+ last_was_esc = false;
+
+ /*
+ * Updating the line count for embedded CR and/or LF chars is
+ * necessarily a little fragile: count CR when the file uses CR line
+ * ends, LF otherwise. This is probably about the best we can do.
+ */
+ if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
+ copy_lineno++;
+
+ if (!in_quote && c == '\r')
+ {
+ if (eol_type == EOL_NL)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("unquoted carriage return found in CSV data"),
+ errhint("Use quoted CSV field to represent carriage return.")));
+ /* Check for \r\n on first line, _and_ handle \r\n. */
+ if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
+ {
+ int c2 = CopyPeekChar();
+
+ if (c2 == '\n')
+ {
+ CopyDonePeek(c2, true); /* eat newline */
+ eol_type = EOL_CRNL;
+ }
+ else
+ {
+ /* found \r, but no \n */
+ if (eol_type == EOL_CRNL)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("unquoted carriage return found in CSV data"),
+ errhint("Use quoted CSV field to represent carriage return.")));
+
+ /*
+ * if we got here, it is the first line and we didn't
+ * get \n, so put it back
+ */
+ CopyDonePeek(c2, false);
+ eol_type = EOL_CR;
+ }
+ }
+ break;
+ }
+ if (!in_quote && c == '\n')
+ {
+ if (eol_type == EOL_CR || eol_type == EOL_CRNL)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("unquoted newline found in CSV data"),
+ errhint("Use quoted CSV field to represent newline.")));
+ eol_type = EOL_NL;
+ break;
+ }
+
+ /* \ is only potentially magical at the start of a line */
+ if (line_buf.len == 0 && c == '\\')
+ {
+ int c2 = CopyPeekChar();
+
+ if (c2 == EOF)
+ {
+ result = true;
+
+ CopyDonePeek(c2, true); /* eat it - do we need to? */
+
+ break;
+ }
+ if (c2 == '.')
+ {
+
+ CopyDonePeek(c2, true); /* so we can keep calling GetChar() */
+
+ if (eol_type == EOL_CRNL)
+ {
+ c = CopyGetChar();
+ if (c == '\n')
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker does not match previous newline style")));
+ if (c != '\r')
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker corrupt")));
+ }
+ c = CopyGetChar();
+ if (c != '\r' && c != '\n')
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker corrupt")));
+ if ((eol_type == EOL_NL && c != '\n') ||
+ (eol_type == EOL_CRNL && c != '\n') ||
+ (eol_type == EOL_CR && c != '\r'))
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker does not match previous newline style")));
+
+ /*
+ * In protocol version 3, we should ignore anything
+ * after \. up to the protocol end of copy data. (XXX
+ * maybe better not to treat \. as special?)
+ */
+ if (copy_dest == COPY_NEW_FE)
+ {
+ while (c != EOF)
+ c = CopyGetChar();
+ }
+ result = true; /* report EOF */
+ break;
+ }
+
+ CopyDonePeek(c2, false); /* not a dot, so put it back */
+
+ }
+
+ appendStringInfoCharMacro(&line_buf, c);
+
+ /*
+ * When client encoding != server, must be careful to read the
+ * extra bytes of a multibyte character exactly, since the encoding
+ * might not ensure they don't look like ASCII. When the encodings
+ * are the same, we need not do this, since no server encoding we
+ * use has ASCII-like following bytes.
+ */
+ if (change_encoding)
+ {
+ s[0] = c;
+ mblen = pg_encoding_mblen(client_encoding, s);
+ for (j = 1; j < mblen; j++)
+ {
+ c = CopyGetChar();
+ if (c == EOF)
+ {
+ result = true;
+ break;
+ }
+ appendStringInfoCharMacro(&line_buf, c);
+ }
+ if (result)
+ break; /* out of outer loop */
+ }
+ } /* end of outer loop */
+
+ /*
+ * Done reading the line. Convert it to server encoding.
+ *
+ * Note: set line_buf_converted to true *before* attempting conversion;
+ * this prevents infinite recursion during error reporting should
+ * pg_client_to_server() issue an error, due to copy_in_error_callback
+ * again attempting the same conversion. We'll end up issuing the message
+ * without conversion, which is bad but better than nothing ...
+ */
+ line_buf_converted = true;
+
+ if (change_encoding)
+ {
+ cvt = (char *) pg_client_to_server((unsigned char *) line_buf.data,
+ line_buf.len);
+ if (cvt != line_buf.data)
+ {
+ /* transfer converted data back to line_buf */
+ line_buf.len = 0;
+ line_buf.data[0] = '\0';
+ appendBinaryStringInfo(&line_buf, cvt, strlen(cvt));
+ }
+ }
+
+ return result;
+ }
+
/*----------
* Read the value of a single attribute, performing de-escaping as needed.
*
***************
*** 2369,2402 ****
for (;;)
{
- /* handle multiline quoted fields */
- if (in_quote && line_buf.cursor >= line_buf.len)
- {
- bool done;
-
- switch (eol_type)
- {
- case EOL_NL:
- appendStringInfoString(&attribute_buf, "\n");
- break;
- case EOL_CR:
- appendStringInfoString(&attribute_buf, "\r");
- break;
- case EOL_CRNL:
- appendStringInfoString(&attribute_buf, "\r\n");
- break;
- case EOL_UNKNOWN:
- /* shouldn't happen - just keep going */
- break;
- }
-
- copy_lineno++;
- done = CopyReadLine();
- if (done && line_buf.len == 0)
- break;
- start_cursor = line_buf.cursor;
- }
-
end_cursor = line_buf.cursor;
if (line_buf.cursor >= line_buf.len)
break;
--- 2618,2623 ----
***************
*** 2629,2653 ****
!use_quote && (c = *test_string) != '\0';
test_string += mblen)
{
- /*
- * We don't know here what the surrounding line end characters
- * might be. It might not even be under postgres' control. So
- * we simple warn on ANY embedded line ending character.
- *
- * This warning will disappear when we make line parsing field-aware,
- * so that we can reliably read in embedded line ending characters
- * regardless of the file's line-end context.
- *
- */
-
- if (!embedded_line_warning && (c == '\n' || c == '\r') )
- {
- embedded_line_warning = true;
- elog(WARNING,
- "CSV fields with embedded linefeed or carriage return "
- "characters might not be able to be reimported");
- }
-
if (c == delimc || c == quotec || c == '\n' || c == '\r')
use_quote = true;
if (!same_encoding)
--- 2850,2855 ----