Skip to content

Commit b454e8e

Browse files
bpo-27580: Add support of null characters in the csv module. (GH-28808)
1 parent b4903af commit b454e8e

File tree

3 files changed

+70
-38
lines changed

3 files changed

+70
-38
lines changed

Lib/test/test_csv.py

Lines changed: 39 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,17 @@ def test_write_escape(self):
217217
self._write_test(['C\\', '6', '7', 'X"'], 'C\\\\,6,7,"X"""',
218218
escapechar='\\', quoting=csv.QUOTE_MINIMAL)
219219

220+
def test_write_lineterminator(self):
221+
for lineterminator in '\r\n', '\n', '\r', '!@#', '\0':
222+
with self.subTest(lineterminator=lineterminator):
223+
with StringIO() as sio:
224+
writer = csv.writer(sio, lineterminator=lineterminator)
225+
writer.writerow(['a', 'b'])
226+
writer.writerow([1, 2])
227+
self.assertEqual(sio.getvalue(),
228+
f'a,b{lineterminator}'
229+
f'1,2{lineterminator}')
230+
220231
def test_write_iterable(self):
221232
self._write_test(iter(['a', 1, 'p,q']), 'a,1,"p,q"')
222233
self._write_test(iter(['a', 1, None]), 'a,1,')
@@ -286,14 +297,10 @@ def test_read_oddinputs(self):
286297
self._read_test([''], [[]])
287298
self.assertRaises(csv.Error, self._read_test,
288299
['"ab"c'], None, strict = 1)
289-
# cannot handle null bytes for the moment
290-
self.assertRaises(csv.Error, self._read_test,
291-
['ab\0c'], None, strict = 1)
292300
self._read_test(['"ab"c'], [['abc']], doublequote = 0)
293301

294302
self.assertRaises(csv.Error, self._read_test,
295-
[b'ab\0c'], None)
296-
303+
[b'abc'], None)
297304

298305
def test_read_eol(self):
299306
self._read_test(['a,b'], [['a','b']])
@@ -313,13 +320,30 @@ def test_read_eof(self):
313320
self.assertRaises(csv.Error, self._read_test,
314321
['^'], [], escapechar='^', strict=True)
315322

323+
def test_read_nul(self):
324+
self._read_test(['\0'], [['\0']])
325+
self._read_test(['a,\0b,c'], [['a', '\0b', 'c']])
326+
self._read_test(['a,b\0,c'], [['a', 'b\0', 'c']])
327+
self._read_test(['a,b\\\0,c'], [['a', 'b\0', 'c']], escapechar='\\')
328+
self._read_test(['a,"\0b",c'], [['a', '\0b', 'c']])
329+
330+
def test_read_delimiter(self):
331+
self._read_test(['a,b,c'], [['a', 'b', 'c']])
332+
self._read_test(['a;b;c'], [['a', 'b', 'c']], delimiter=';')
333+
self._read_test(['a\0b\0c'], [['a', 'b', 'c']], delimiter='\0')
334+
316335
def test_read_escape(self):
317336
self._read_test(['a,\\b,c'], [['a', 'b', 'c']], escapechar='\\')
318337
self._read_test(['a,b\\,c'], [['a', 'b,c']], escapechar='\\')
319338
self._read_test(['a,"b\\,c"'], [['a', 'b,c']], escapechar='\\')
320339
self._read_test(['a,"b,\\c"'], [['a', 'b,c']], escapechar='\\')
321340
self._read_test(['a,"b,c\\""'], [['a', 'b,c"']], escapechar='\\')
322341
self._read_test(['a,"b,c"\\'], [['a', 'b,c\\']], escapechar='\\')
342+
self._read_test(['a,^b,c'], [['a', 'b', 'c']], escapechar='^')
343+
self._read_test(['a,\0b,c'], [['a', 'b', 'c']], escapechar='\0')
344+
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar=None)
345+
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar='')
346+
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']])
323347

324348
def test_read_quoting(self):
325349
self._read_test(['1,",3,",5'], [['1', ',3,', '5']])
@@ -334,6 +358,8 @@ def test_read_quoting(self):
334358
self.assertRaises(ValueError, self._read_test,
335359
['abc,3'], [[]],
336360
quoting=csv.QUOTE_NONNUMERIC)
361+
self._read_test(['1,@,3,@,5'], [['1', ',3,', '5']], quotechar='@')
362+
self._read_test(['1,\0,3,\0,5'], [['1', ',3,', '5']], quotechar='\0')
337363

338364
def test_read_bigfield(self):
339365
# This exercises the buffer realloc functionality and field size
@@ -1074,6 +1100,12 @@ class TestSniffer(unittest.TestCase):
10741100
a,b
10751101
""")
10761102

1103+
sample14 = """\
1104+
abc\0def
1105+
ghijkl\0mno
1106+
ghi\0jkl
1107+
"""
1108+
10771109
def test_issue43625(self):
10781110
sniffer = csv.Sniffer()
10791111
self.assertTrue(sniffer.has_header(self.sample12))
@@ -1142,6 +1174,8 @@ def test_delimiters(self):
11421174
dialect = sniffer.sniff(self.sample9)
11431175
self.assertEqual(dialect.delimiter, '+')
11441176
self.assertEqual(dialect.quotechar, "'")
1177+
dialect = sniffer.sniff(self.sample14)
1178+
self.assertEqual(dialect.delimiter, '\0')
11451179

11461180
def test_doublequote(self):
11471181
sniffer = csv.Sniffer()
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Add support of null characters in :mod:`csv`.

Modules/_csv.c

Lines changed: 30 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,9 @@ module instead.
1414
#include "structmember.h" // PyMemberDef
1515
#include <stdbool.h>
1616

17+
#define NOT_SET ((Py_UCS4)-1)
18+
#define EOL ((Py_UCS4)-2)
19+
1720

1821
typedef struct {
1922
PyObject *error_obj; /* CSV exception */
@@ -153,9 +156,9 @@ get_dialect_from_registry(PyObject *name_obj, _csvstate *module_state)
153156
}
154157

155158
static PyObject *
156-
get_nullchar_as_None(Py_UCS4 c)
159+
get_char_or_None(Py_UCS4 c)
157160
{
158-
if (c == '\0') {
161+
if (c == NOT_SET) {
159162
Py_RETURN_NONE;
160163
}
161164
else
@@ -172,19 +175,19 @@ Dialect_get_lineterminator(DialectObj *self, void *Py_UNUSED(ignored))
172175
static PyObject *
173176
Dialect_get_delimiter(DialectObj *self, void *Py_UNUSED(ignored))
174177
{
175-
return get_nullchar_as_None(self->delimiter);
178+
return get_char_or_None(self->delimiter);
176179
}
177180

178181
static PyObject *
179182
Dialect_get_escapechar(DialectObj *self, void *Py_UNUSED(ignored))
180183
{
181-
return get_nullchar_as_None(self->escapechar);
184+
return get_char_or_None(self->escapechar);
182185
}
183186

184187
static PyObject *
185188
Dialect_get_quotechar(DialectObj *self, void *Py_UNUSED(ignored))
186189
{
187-
return get_nullchar_as_None(self->quotechar);
190+
return get_char_or_None(self->quotechar);
188191
}
189192

190193
static PyObject *
@@ -235,7 +238,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt
235238
*target = dflt;
236239
}
237240
else {
238-
*target = '\0';
241+
*target = NOT_SET;
239242
if (src != Py_None) {
240243
if (!PyUnicode_Check(src)) {
241244
PyErr_Format(PyExc_TypeError,
@@ -254,7 +257,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt
254257
return -1;
255258
}
256259
/* PyUnicode_READY() is called in PyUnicode_GetLength() */
257-
else {
260+
else if (len > 0) {
258261
*target = PyUnicode_READ_CHAR(src, 0);
259262
}
260263
}
@@ -269,7 +272,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
269272
*target = dflt;
270273
}
271274
else {
272-
*target = '\0';
275+
*target = NOT_SET;
273276
if (!PyUnicode_Check(src)) {
274277
PyErr_Format(PyExc_TypeError,
275278
"\"%s\" must be string, not %.200s", name,
@@ -287,7 +290,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
287290
return -1;
288291
}
289292
/* PyUnicode_READY() is called in PyUnicode_GetLength() */
290-
else {
293+
else if (len > 0) {
291294
*target = PyUnicode_READ_CHAR(src, 0);
292295
}
293296
}
@@ -481,7 +484,7 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
481484
goto err
482485
DIASET(_set_char, "delimiter", &self->delimiter, delimiter, ',');
483486
DIASET(_set_bool, "doublequote", &self->doublequote, doublequote, true);
484-
DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, 0);
487+
DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, NOT_SET);
485488
DIASET(_set_str, "lineterminator", &self->lineterminator, lineterminator, "\r\n");
486489
DIASET(_set_char_or_none, "quotechar", &self->quotechar, quotechar, '"');
487490
DIASET(_set_int, "quoting", &self->quoting, quoting, QUOTE_MINIMAL);
@@ -491,19 +494,19 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
491494
/* validate options */
492495
if (dialect_check_quoting(self->quoting))
493496
goto err;
494-
if (self->delimiter == 0) {
497+
if (self->delimiter == NOT_SET) {
495498
PyErr_SetString(PyExc_TypeError,
496499
"\"delimiter\" must be a 1-character string");
497500
goto err;
498501
}
499502
if (quotechar == Py_None && quoting == NULL)
500503
self->quoting = QUOTE_NONE;
501-
if (self->quoting != QUOTE_NONE && self->quotechar == 0) {
504+
if (self->quoting != QUOTE_NONE && self->quotechar == NOT_SET) {
502505
PyErr_SetString(PyExc_TypeError,
503506
"quotechar must be set if quoting enabled");
504507
goto err;
505508
}
506-
if (self->lineterminator == 0) {
509+
if (self->lineterminator == NULL) {
507510
PyErr_SetString(PyExc_TypeError, "lineterminator must be set");
508511
goto err;
509512
}
@@ -670,7 +673,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
670673
switch (self->state) {
671674
case START_RECORD:
672675
/* start of record */
673-
if (c == '\0')
676+
if (c == EOL)
674677
/* empty line - return [] */
675678
break;
676679
else if (c == '\n' || c == '\r') {
@@ -682,11 +685,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
682685
/* fallthru */
683686
case START_FIELD:
684687
/* expecting field */
685-
if (c == '\n' || c == '\r' || c == '\0') {
688+
if (c == '\n' || c == '\r' || c == EOL) {
686689
/* save empty field - return [fields] */
687690
if (parse_save_field(self) < 0)
688691
return -1;
689-
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
692+
self->state = (c == EOL ? START_RECORD : EAT_CRNL);
690693
}
691694
else if (c == dialect->quotechar &&
692695
dialect->quoting != QUOTE_NONE) {
@@ -722,25 +725,25 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
722725
self->state = AFTER_ESCAPED_CRNL;
723726
break;
724727
}
725-
if (c == '\0')
728+
if (c == EOL)
726729
c = '\n';
727730
if (parse_add_char(self, module_state, c) < 0)
728731
return -1;
729732
self->state = IN_FIELD;
730733
break;
731734

732735
case AFTER_ESCAPED_CRNL:
733-
if (c == '\0')
736+
if (c == EOL)
734737
break;
735738
/*fallthru*/
736739

737740
case IN_FIELD:
738741
/* in unquoted field */
739-
if (c == '\n' || c == '\r' || c == '\0') {
742+
if (c == '\n' || c == '\r' || c == EOL) {
740743
/* end of line - return [fields] */
741744
if (parse_save_field(self) < 0)
742745
return -1;
743-
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
746+
self->state = (c == EOL ? START_RECORD : EAT_CRNL);
744747
}
745748
else if (c == dialect->escapechar) {
746749
/* possible escaped character */
@@ -761,7 +764,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
761764

762765
case IN_QUOTED_FIELD:
763766
/* in quoted field */
764-
if (c == '\0')
767+
if (c == EOL)
765768
;
766769
else if (c == dialect->escapechar) {
767770
/* Possible escape character */
@@ -786,7 +789,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
786789
break;
787790

788791
case ESCAPE_IN_QUOTED_FIELD:
789-
if (c == '\0')
792+
if (c == EOL)
790793
c = '\n';
791794
if (parse_add_char(self, module_state, c) < 0)
792795
return -1;
@@ -808,11 +811,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
808811
return -1;
809812
self->state = START_FIELD;
810813
}
811-
else if (c == '\n' || c == '\r' || c == '\0') {
814+
else if (c == '\n' || c == '\r' || c == EOL) {
812815
/* end of line - return [fields] */
813816
if (parse_save_field(self) < 0)
814817
return -1;
815-
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
818+
self->state = (c == EOL ? START_RECORD : EAT_CRNL);
816819
}
817820
else if (!dialect->strict) {
818821
if (parse_add_char(self, module_state, c) < 0)
@@ -831,7 +834,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
831834
case EAT_CRNL:
832835
if (c == '\n' || c == '\r')
833836
;
834-
else if (c == '\0')
837+
else if (c == EOL)
835838
self->state = START_RECORD;
836839
else {
837840
PyErr_Format(module_state->error_obj,
@@ -909,20 +912,14 @@ Reader_iternext(ReaderObj *self)
909912
linelen = PyUnicode_GET_LENGTH(lineobj);
910913
while (linelen--) {
911914
c = PyUnicode_READ(kind, data, pos);
912-
if (c == '\0') {
913-
Py_DECREF(lineobj);
914-
PyErr_Format(module_state->error_obj,
915-
"line contains NUL");
916-
goto err;
917-
}
918915
if (parse_process_char(self, module_state, c) < 0) {
919916
Py_DECREF(lineobj);
920917
goto err;
921918
}
922919
pos++;
923920
}
924921
Py_DECREF(lineobj);
925-
if (parse_process_char(self, module_state, 0) < 0)
922+
if (parse_process_char(self, module_state, EOL) < 0)
926923
goto err;
927924
} while (self->state != START_RECORD);
928925

@@ -1127,7 +1124,7 @@ join_append_data(WriterObj *self, unsigned int field_kind, const void *field_dat
11271124
*quoted = 1;
11281125
}
11291126
if (want_escape) {
1130-
if (!dialect->escapechar) {
1127+
if (dialect->escapechar == NOT_SET) {
11311128
PyErr_Format(self->error_obj,
11321129
"need to escape, but no escapechar set");
11331130
return -1;

0 commit comments

Comments
 (0)