diff --git a/doc/source/io.rst b/doc/source/io.rst
index e72224c6fa1fe..53abc9edb2ebb 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2232,9 +2232,10 @@ Read a URL and match a table that contains specific text
match = 'Metcalf Bank'
df_list = pd.read_html(url, match=match)
-Specify a header row (by default ``<th>`` elements are used to form the column
-index); if specified, the header row is taken from the data minus the parsed
-header elements (``<th>`` elements).
+Specify a header row (by default ``<th>`` or ``<td>`` elements located within a
+``<thead>`` are used to form the column index, if multiple rows are contained within
+``<thead>`` then a MultiIndex is created); if specified, the header row is taken
+from the data minus the parsed header elements (``<th>`` elements).
.. code-block:: python
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 2e822729873ad..d2d97f71186b9 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -309,6 +309,7 @@ Other Enhancements
- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)
- ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`)
- ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`)
+- ``pd.read_html()`` parses multiple header rows, creating a ``MultiIndex`` header (:issue:`13434`)
- HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`)
- ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 53595b94eb94d..8a3709dba2176 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -355,9 +355,12 @@ def _parse_raw_thead(self, table):
thead = self._parse_thead(table)
res = []
if thead:
- res = lmap(self._text_getter, self._parse_th(thead[0]))
- return np.atleast_1d(
- np.array(res).squeeze()) if res and len(res) == 1 else res
+ trs = self._parse_tr(thead[0])
+ for tr in trs:
+ cols = lmap(self._text_getter, self._parse_td(tr))
+ if any([col != '' for col in cols]):
+ res.append(cols)
+ return res
def _parse_raw_tfoot(self, table):
tfoot = self._parse_tfoot(table)
@@ -591,9 +594,17 @@ def _parse_tfoot(self, table):
return table.xpath('.//tfoot')
def _parse_raw_thead(self, table):
- expr = './/thead//th'
- return [_remove_whitespace(x.text_content()) for x in
- table.xpath(expr)]
+ expr = './/thead'
+ thead = table.xpath(expr)
+ res = []
+ if thead:
+ trs = self._parse_tr(thead[0])
+ for tr in trs:
+ cols = [_remove_whitespace(x.text_content()) for x in
+ self._parse_td(tr)]
+ if any([col != '' for col in cols]):
+ res.append(cols)
+ return res
def _parse_raw_tfoot(self, table):
expr = './/tfoot//th|//tfoot//td'
@@ -615,19 +626,17 @@ def _data_to_frame(**kwargs):
head, body, foot = kwargs.pop('data')
header = kwargs.pop('header')
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
-
if head:
- body = [head] + body
-
+ rows = lrange(len(head))
+ body = head + body
if header is None: # special case when a table has <thead> elements
- header = 0
+ header = 0 if rows == [0] else rows
if foot:
body += [foot]
# fill out elements of body that are "ragged"
_expand_elements(body)
-
tp = TextParser(body, header=header, **kwargs)
df = tp.read()
return df
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index c1a2a4545a6f9..4aa85c0f63a68 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -760,6 +760,18 @@ def test_keep_default_na(self):
html_df = read_html(html_data, keep_default_na=True)[0]
tm.assert_frame_equal(expected_df, html_df)
+ def test_multiple_header_rows(self):
+ # Issue #13434
+ expected_df = DataFrame(data=[("Hillary", 68, "D"),
+ ("Bernie", 74, "D"),
+ ("Donald", 69, "R")])
+ expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
+ ["Name", "Unnamed: 1_level_1",
+ "Unnamed: 2_level_1"]]
+ html = expected_df.to_html(index=False)
+ html_df = read_html(html, )[0]
+ tm.assert_frame_equal(expected_df, html_df)
+
def _lang_enc(filename):
return os.path.splitext(os.path.basename(filename))[0].split('_')
| |