diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 2b0b62ab7facf..96a702622b1dc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -750,6 +750,7 @@ I/O - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) +- Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) Period diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 153ac4b5f0893..1a5ac31cc821b 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -707,7 +707,8 @@ cdef class TextReader: ic = (len(self.index_col) if self.index_col is not None else 0) - if lc != unnamed_count and lc - ic > unnamed_count: + # if wrong number of blanks or no index, not our format + if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0: hr -= 1 self.parser_start -= 1 this_header = [None] * lc diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 7eefd26b194ab..673c023325e3e 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -551,7 +551,11 @@ def parse( header_name, _ = pop_header_name(data[row], index_col) header_names.append(header_name) - has_index_names = is_list_like(header) and len(header) > 1 + # If there is a MultiIndex header and an index then there is also + # a row containing just the index name(s) + has_index_names = ( + is_list_like(header) and len(header) > 1 and index_col is not None + ) if is_list_like(index_col): # Forward fill values for MultiIndex index. diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 37f553c724c9e..cbb0dd68ef038 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -431,7 +431,8 @@ def _infer_columns(self): ic = len(self.index_col) if self.index_col is not None else 0 unnamed_count = len(this_unnamed_cols) - if lc != unnamed_count and lc - ic > unnamed_count: + # if wrong number of blanks or no index, not our format + if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0: clear_buffer = False this_columns = [None] * lc self.buf = [self.buf[-1]] diff --git a/pandas/tests/io/data/excel/testmultiindex.ods b/pandas/tests/io/data/excel/testmultiindex.ods index deb88bdad1694..dca8d70abdc24 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.ods and b/pandas/tests/io/data/excel/testmultiindex.ods differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xls b/pandas/tests/io/data/excel/testmultiindex.xls index 08dc78ea34d56..c91698be29b13 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xls and b/pandas/tests/io/data/excel/testmultiindex.xls differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsb b/pandas/tests/io/data/excel/testmultiindex.xlsb index f5f62d305640f..a693e0c66afc2 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xlsb and b/pandas/tests/io/data/excel/testmultiindex.xlsb differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsm b/pandas/tests/io/data/excel/testmultiindex.xlsm index 8bd16b016608c..5a2a4ea35f0d9 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xlsm and b/pandas/tests/io/data/excel/testmultiindex.xlsm differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsx b/pandas/tests/io/data/excel/testmultiindex.xlsx index 56fc6f20b711a..a6174445bb83a 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xlsx and b/pandas/tests/io/data/excel/testmultiindex.xlsx differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 382c8412ab050..c4b3221e1d3a7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1193,6 +1193,17 @@ def test_one_col_noskip_blank_line(self, read_ext): result = pd.read_excel(file_name) tm.assert_frame_equal(result, expected) + def test_multiheader_two_blank_lines(self, read_ext): + # GH 40442 + file_name = "testmultiindex" + read_ext + columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) + data = [[np.nan, np.nan], [np.nan, np.nan], [1, 3], [2, 4]] + expected = DataFrame(data, columns=columns) + result = pd.read_excel( + file_name, sheet_name="mi_column_empty_rows", header=[0, 1] + ) + tm.assert_frame_equal(result, expected) + class TestExcelFileRead: @pytest.fixture(autouse=True) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index f15fc16fbce38..3b814360d3aa4 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -389,6 +389,17 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) +def test_header_multi_index_blank_line(all_parsers): + # GH 40442 + parser = all_parsers + data = [[None, None], [1, 2], [3, 4]] + columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) + expected = DataFrame(data, columns=columns) + data = "a,b\nA,B\n,\n1,2\n3,4" + result = parser.read_csv(StringIO(data), header=[0, 1]) + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] )