diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 48a3bfdab62c9..0e38396d156e1 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -286,6 +286,7 @@ Other Enhancements - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). - Compatibility with Matplotlib 3.0 (:issue:`22790`). - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) +- :func:`read_fwf` now accepts keyword ``infer_nrows`` (:issue:`15138`). - :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`) - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`8917`) - :meth:`Index.difference` now has an optional ``sort`` parameter to specify whether the results should be sorted if possible (:issue:`17839`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index acb9bca2545c0..4bbd69cb5c8ad 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -501,6 +501,7 @@ def _read(filepath_or_buffer, kwds): _fwf_defaults = { 'colspecs': 'infer', + 'infer_nrows': 100, 'widths': None, } @@ -718,8 +719,8 @@ def parser_f(filepath_or_buffer, )(read_table) -def read_fwf(filepath_or_buffer, colspecs='infer', - widths=None, **kwds): +def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, + infer_nrows=100, **kwds): r""" Read a table of fixed-width formatted lines into DataFrame. 
@@ -752,6 +753,11 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths : list of int, optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. + infer_nrows : int, default 100 + The number of rows to consider when letting the parser determine the + `colspecs`. + + .. versionadded:: 0.24.0 **kwds : optional Optional keyword arguments can be passed to ``TextFileReader``. @@ -786,6 +792,7 @@ def read_fwf(filepath_or_buffer, colspecs='infer', col += w kwds['colspecs'] = colspecs + kwds['infer_nrows'] = infer_nrows kwds['engine'] = 'python-fwf' return _read(filepath_or_buffer, kwds) @@ -3442,13 +3449,15 @@ class FixedWidthReader(BaseIterator): A reader of fixed-width lines. """ - def __init__(self, f, colspecs, delimiter, comment, skiprows=None): + def __init__(self, f, colspecs, delimiter, comment, skiprows=None, + infer_nrows=100): self.f = f self.buffer = None self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t ' self.comment = comment if colspecs == 'infer': - self.colspecs = self.detect_colspecs(skiprows=skiprows) + self.colspecs = self.detect_colspecs(infer_nrows=infer_nrows, + skiprows=skiprows) else: self.colspecs = colspecs @@ -3464,19 +3473,20 @@ def __init__(self, f, colspecs, delimiter, comment, skiprows=None): raise TypeError('Each column specification must be ' '2 element tuple or list of integers') - def get_rows(self, n, skiprows=None): + def get_rows(self, infer_nrows, skiprows=None): """ Read rows from self.f, skipping as specified. - We distinguish buffer_rows (the first <= n lines) - from the rows returned to detect_colspecs because - it's simpler to leave the other locations with - skiprows logic alone than to modify them to deal - with the fact we skipped some rows here as well. 
+ We distinguish buffer_rows (the first <= infer_nrows + lines) from the rows returned to detect_colspecs + because it's simpler to leave the other locations + with skiprows logic alone than to modify them to + deal with the fact we skipped some rows here as + well. Parameters ---------- - n : int + infer_nrows : int Number of rows to read from self.f, not counting rows that are skipped. skiprows: set, optional @@ -3496,16 +3506,16 @@ def get_rows(self, n, skiprows=None): if i not in skiprows: detect_rows.append(row) buffer_rows.append(row) - if len(detect_rows) >= n: + if len(detect_rows) >= infer_nrows: break self.buffer = iter(buffer_rows) return detect_rows - def detect_colspecs(self, n=100, skiprows=None): + def detect_colspecs(self, infer_nrows=100, skiprows=None): # Regex escape the delimiters delimiters = ''.join(r'\%s' % x for x in self.delimiter) pattern = re.compile('([^%s]+)' % delimiters) - rows = self.get_rows(n, skiprows) + rows = self.get_rows(infer_nrows, skiprows) if not rows: raise EmptyDataError("No rows from which to infer column width") max_len = max(map(len, rows)) @@ -3544,8 +3554,10 @@ class FixedWidthFieldParser(PythonParser): def __init__(self, f, **kwds): # Support iterators, convert to a list. 
self.colspecs = kwds.pop('colspecs') + self.infer_nrows = kwds.pop('infer_nrows') PythonParser.__init__(self, f, **kwds) def _make_reader(self, f): self.data = FixedWidthReader(f, self.colspecs, self.delimiter, - self.comment, self.skiprows) + self.comment, self.skiprows, + self.infer_nrows) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index bb64a85590c8b..1c89e41df8df9 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -140,6 +140,22 @@ def test_fwf_colspecs_None(self): expected = DataFrame([[123456, 456], [456789, 789]]) tm.assert_frame_equal(result, expected) + def test_fwf_colspecs_infer_nrows(self): + # GH 15138 + data = """\ + 1 2 +123 98 +""" + # infer_nrows == 1 should have colspec == [(2, 3), (5, 6)] + df = read_fwf(StringIO(data), header=None, infer_nrows=1) + expected = pd.DataFrame([[1, 2], [3, 8]]) + tm.assert_frame_equal(df, expected) + + # test for infer_nrows > number of rows + df = read_fwf(StringIO(data), header=None, infer_nrows=10) + expected = pd.DataFrame([[1, 2], [123, 98]]) + tm.assert_frame_equal(df, expected) + def test_fwf_regression(self): # GH 3594 # turns out 'T060' is parsable as a datetime slice!