From 11ccfab24d09413dcc088ba06a78cc3bf006c35d Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Thu, 15 Sep 2011 17:13:57 +0100 Subject: [PATCH 1/3] read_csv automatically sniffs out separator, using csv.Sniffer() --- pandas/io/parsers.py | 45 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 06a0aae6328d9..9bf93d118e9b2 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -12,8 +12,8 @@ from pandas.core.index import Index from pandas.core.frame import DataFrame -def read_csv(filepath_or_buffer, header=0, skiprows=None, index_col=0, - na_values=None, date_parser=None, names=None): +def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, + na_values=None, date_parser=None, names=None, sniff_sep=True): """ Read CSV file into DataFrame @@ -34,6 +34,9 @@ def read_csv(filepath_or_buffer, header=0, skiprows=None, index_col=0, dateutil.parser names : array-like List of column names + sniff_sep : boolean, default True + Attempt to automatically determine the separator for the data. Defaults + to True, however if sep is defined then it will take precedence Returns ------- @@ -50,7 +53,19 @@ def read_csv(filepath_or_buffer, header=0, skiprows=None, index_col=0, except Exception: # pragma: no cover f = open(filepath_or_buffer, 'r') - reader = csv.reader(f, dialect='excel') + # default dialect + dia = csv.excel + if sep is not None: + sniff_sep = False + dia.delimiter = sep + # attempt to sniff the delimiter + if sniff_sep: + sample = f.readline() + sniffed = csv.Sniffer().sniff(sample) + dia.delimiter = sniffed.delimiter + f.seek(0) + + reader = csv.reader(f, dialect=dia) if skiprows is not None: skiprows = set(skiprows) @@ -63,8 +78,7 @@ def read_csv(filepath_or_buffer, header=0, skiprows=None, index_col=0, date_parser=date_parser) def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None, - index_col=0, na_values=None, names=None, - date_parser=None): + index_col=0, na_values=None, date_parser=None, names=None): """ Read delimited file into DataFrame @@ -92,25 +106,8 @@ def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None, ------- parsed : DataFrame """ - if hasattr(filepath_or_buffer, 'read'): - reader = filepath_or_buffer - else: - try: - # universal newline mode - reader = open(filepath_or_buffer, 'U') - except Exception: # pragma: no cover - reader = open(filepath_or_buffer, 'r') - - if skiprows is not None: - skiprows = set(skiprows) - lines = [l for i, l in enumerate(reader) if i not in skiprows] - else: - lines = [l for l in reader] - - lines = [re.split(sep, l.rstrip()) for l in lines] - return _simple_parser(lines, header=header, indexCol=index_col, - colNames=names, na_values=na_values, - date_parser=date_parser) + return read_csv(filepath_or_buffer, sep, header, skiprows, + index_col, na_values, date_parser, names) def _simple_parser(lines, colNames=None, header=0, indexCol=0, na_values=None, date_parser=None, parse_dates=True): From 79f841d8ba2d819a5e63d77571f6ef46b50d71ab Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Fri, 16 Sep 2011 10:21:38 +0100 Subject: [PATCH 2/3] Adding additional comment to docstring --- pandas/io/parsers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9bf93d118e9b2..cd9b18718689e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,6 +20,9 @@ def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, Parameters ---------- filepath_or_buffer : string or file handle / StringIO + sep : string, default None + Delimiter to use. By default will try to automatically determine + this header : int, default 0 Row to use for the column labels of the parsed DataFrame skiprows : list-like From ef5515cbb6684a1259319bbccb2de526f575c461 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Fri, 16 Sep 2011 10:27:12 +0100 Subject: [PATCH 3/3] Remove sniff_sep from params; cleaner and simpler --- pandas/io/parsers.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index cd9b18718689e..02205e79a5778 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -13,7 +13,7 @@ from pandas.core.frame import DataFrame def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, - na_values=None, date_parser=None, names=None, sniff_sep=True): + na_values=None, date_parser=None, names=None): """ Read CSV file into DataFrame @@ -37,9 +37,6 @@ def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, dateutil.parser names : array-like List of column names - sniff_sep : boolean, default True - Attempt to automatically determine the separator for the data. Defaults - to True, however if sep is defined then it will take precedence Returns ------- @@ -56,6 +53,7 @@ def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, except Exception: # pragma: no cover f = open(filepath_or_buffer, 'r') + sniff_sep = True # default dialect dia = csv.excel if sep is not None: