diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 35594fbc8fb8a..5629ebf604697 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -873,6 +873,7 @@ I/O - Bug in :func:`read_csv` interpreting second row as :class:`Index` names even when ``index_col=False`` (:issue:`46569`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) +- Bug in :func:`read_csv` ignoring non-existing header row for ``engine="python"`` (:issue:`47400`) - Bug in :func:`read_excel` raising uncontrolled ``IndexError`` when ``header`` references non-existing rows (:issue:`43143`) - Bug in :func:`read_html` where elements surrounding ``
`` were joined without a space between them (:issue:`29528`) - Bug in :func:`read_csv` when data is longer than header leading to issues with callables in ``usecols`` expecting strings (:issue:`46997`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 2bf66923ddd60..ec953c9df036c 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -379,10 +379,16 @@ def _infer_columns( line = self._next_line() except StopIteration as err: - if self.line_pos < hr: + if 0 < self.line_pos <= hr and ( + not have_mi_columns or hr != header[-1] + ): + # If no rows we want to raise a different message and if + # we have mi columns, the last line is not part of the header + joi = list(map(str, header[:-1] if have_mi_columns else header)) + msg = f"[{','.join(joi)}], len of {len(joi)}, " raise ValueError( - f"Passed header={hr} but only {self.line_pos + 1} lines in " - "file" + f"Passed header={msg}" + f"but only {self.line_pos} lines in file" ) from err # We have an empty file, so check diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 3fc23525df89e..4ded70db8bae7 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -666,3 +666,15 @@ def test_header_none_and_on_bad_lines_skip(all_parsers): ) expected = DataFrame({"a": ["x", "z"], "b": [1, 3]}) tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_header_missing_rows(all_parsers): + # GH#47400 + parser = all_parsers + data = """a,b +1,2 +""" + msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=[0, 1, 2])