From b05ebe4c4d5123592797c59282e3fab6e8eeed04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20V=C3=A9rit=C3=A9?= Date: Mon, 30 Sep 2024 20:57:50 +0200 Subject: [PATCH v6] Support backslash-dot on a line by itself as valid data in COPY FROM...CSV --- doc/src/sgml/ref/copy.sgml | 6 +-- doc/src/sgml/ref/psql-ref.sgml | 4 -- src/backend/commands/copyfromparse.c | 57 +++++++--------------------- src/bin/psql/copy.c | 28 +++++++++----- src/test/regress/expected/copy.out | 18 +++++++++ src/test/regress/sql/copy.sql | 12 ++++++ 6 files changed, 63 insertions(+), 62 deletions(-) diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 1518af8a04..f7eb59dbac 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -814,11 +814,7 @@ COPY count format, \., the end-of-data marker, could also appear as a data value. To avoid any misinterpretation, a \. data value appearing as a lone entry on a line is automatically - quoted on output, and on input, if quoted, is not interpreted as the - end-of-data marker. If you are loading a file created by another - application that has a single unquoted column and might have a - value of \., you might need to quote that value in the - input file. + quoted on output. diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index 3fd9959ed1..0570211cf2 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -1179,10 +1179,6 @@ SELECT $1 \parse stmt1 destination, because all data must pass through the client/server connection. For large amounts of data the SQL command might be preferable. - Also, because of this pass-through method, \copy - ... from in CSV mode will erroneously - treat a \. data value alone on a line as an - end-of-input marker. diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 97a4c387a3..d0df2d807f 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -136,14 +136,6 @@ if (1) \ } \ } else ((void) 0) -/* Undo any read-ahead and jump out of the block. */ -#define NO_END_OF_COPY_GOTO \ -if (1) \ -{ \ - input_buf_ptr = prev_raw_ptr + 1; \ - goto not_end_of_copy; \ -} else ((void) 0) - /* NOTE: there's a copy of this in copyto.c */ static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; @@ -1182,7 +1174,6 @@ CopyReadLineText(CopyFromState cstate) bool result = false; /* CSV variables */ - bool first_char_in_line = true; bool in_quote = false, last_was_esc = false; char quotec = '\0'; @@ -1377,10 +1368,9 @@ CopyReadLineText(CopyFromState cstate) } /* - * In CSV mode, we only recognize \. alone on a line. This is because - * \. is a valid CSV data value. + * In CSV mode, backslash is a normal character. */ - if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line)) + if (c == '\\' && !cstate->opts.csv_mode) { char c2; @@ -1413,21 +1403,15 @@ CopyReadLineText(CopyFromState cstate) if (c2 == '\n') { - if (!cstate->opts.csv_mode) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker does not match previous newline style"))); - else - NO_END_OF_COPY_GOTO; + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker does not match previous newline style"))); } else if (c2 != '\r') { - if (!cstate->opts.csv_mode) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker corrupt"))); - else - NO_END_OF_COPY_GOTO; + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker corrupt"))); } } @@ -1438,12 +1422,9 @@ CopyReadLineText(CopyFromState cstate) if (c2 != '\r' && c2 != '\n') { - if (!cstate->opts.csv_mode) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker corrupt"))); - else - NO_END_OF_COPY_GOTO; + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker corrupt"))); } if ((cstate->eol_type == EOL_NL && c2 != '\n') || @@ -1467,7 +1448,7 @@ CopyReadLineText(CopyFromState cstate) result = true; /* report EOF */ break; } - else if (!cstate->opts.csv_mode) + else { /* * If we are here, it means we found a backslash followed by @@ -1475,23 +1456,11 @@ CopyReadLineText(CopyFromState cstate) * after a backslash is special, so we skip over that second * character too. If we didn't do that \\. would be * considered an eof-of copy, while in non-CSV mode it is a - * literal backslash followed by a period. In CSV mode, - * backslashes are not special, so we want to process the - * character after the backslash just like a normal character, - * so we don't increment in those cases. + * literal backslash followed by a period. */ input_buf_ptr++; } } - - /* - * This label is for CSV cases where \. appears at the start of a - * line, but there is more text after it, meaning it was a data value. - * We are more strict for \. in CSV mode because \. could be a data - * value, while in non-CSV mode, \. cannot be a data value. - */ -not_end_of_copy: - first_char_in_line = false; } /* end of outer loop */ /* diff --git a/src/bin/psql/copy.c b/src/bin/psql/copy.c index 961ae32694..57b36a082e 100644 --- a/src/bin/psql/copy.c +++ b/src/bin/psql/copy.c @@ -620,20 +620,30 @@ handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary, PGresult **res) /* current line is done? */ if (buf[buflen - 1] == '\n') { - /* check for EOF marker, but not on a partial line */ - if (at_line_begin) + /* + * When at the beginning of the line, check for EOF + * marker. If the marker is found and the data is inlined, + * we must stop at this point. If not, the \. line can be + * sent to the server, and we let it decide whether it's + * an EOF or not depending on the format: in basic TEXT, + * \. is going to be interpreted as an EOF, in CSV, it + * will not. + */ + if (at_line_begin && copystream == pset.cur_cmd_source) { - /* - * This code erroneously assumes '\.' on a line alone - * inside a quoted CSV string terminates the \copy. - * https://www.postgresql.org/message-id/E1TdNVQ-0001ju-GO@wrigleys.postgresql.org - * - * https://www.postgresql.org/message-id/bfcd57e4-8f23-4c3e-a5db-2571d09208e2@beta.fastmail.com - */ if ((linelen == 3 && memcmp(fgresult, "\\.\n", 3) == 0) || (linelen == 4 && memcmp(fgresult, "\\.\r\n", 4) == 0)) { copydone = true; + + /* + * Remove the EOF marker from the data sent. In + * the case of CSV, the EOF marker must be + * removed, otherwise it would be interpreted by + * the server as valid data. + */ + *fgresult = '\0'; + buflen -= linelen; } } diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out index 44114089a6..174fe05603 100644 --- a/src/test/regress/expected/copy.out +++ b/src/test/regress/expected/copy.out @@ -32,6 +32,24 @@ select * from copytest except select * from copytest2; -------+------+-------- (0 rows) +--- test unquoted \. as data inside CSV +-- do not use copy out to export the data, as it would quote \. +\o :filename +\qecho line1 +\qecho '\\.' +\qecho line2 +\o +-- get the data back in with copy +truncate copytest2; +copy copytest2(test) from :'filename' csv; +select test from copytest2 order by test collate "C"; + test +------- + \. + line1 + line2 +(3 rows) + -- test header line feature create temp table copytest3 ( c1 int, diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql index e2dd24cb35..8ed7922ab4 100644 --- a/src/test/regress/sql/copy.sql +++ b/src/test/regress/sql/copy.sql @@ -38,6 +38,18 @@ copy copytest2 from :'filename' csv quote '''' escape E'\\'; select * from copytest except select * from copytest2; +--- test unquoted \. as data inside CSV +-- do not use copy out to export the data, as it would quote \. +\o :filename +\qecho line1 +\qecho '\\.' +\qecho line2 +\o +-- get the data back in with copy +truncate copytest2; +copy copytest2(test) from :'filename' csv; +select test from copytest2 order by test collate "C"; + -- test header line feature -- 2.34.1