-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
ENH: pd.read_html argument to extract hrefs along with text from cells #45973
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
d69ce74
ac86888
b33dc9e
a13c5f0
76ebe35
cd352e7
1de1324
1190ea7
db8b6db
1c8c891
1555fbd
0935696
afaad1a
20e24e9
ffdcf8a
dbd4580
490005a
a5ff5c1
85a183d
58fdb0c
c34d8ff
7389b84
ba7caab
4c7f532
98a46e2
fd41935
614c636
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ | |
import numbers | ||
import re | ||
from typing import ( | ||
Literal, | ||
Pattern, | ||
Sequence, | ||
cast, | ||
|
@@ -180,6 +181,11 @@ class _HtmlFrameParser: | |
displayed_only : bool | ||
Whether or not items with "display:none" should be ignored | ||
|
||
extract_hrefs : {"all", "header", "body", "footer"} or None | ||
Table elements in the specified section(s) with <a> tags will have their | ||
href extracted. Note that specifying "header" will result in a | ||
:class:`~pandas.MultiIndex`. | ||
|
||
Attributes | ||
---------- | ||
io : str or file-like | ||
|
@@ -198,11 +204,15 @@ class _HtmlFrameParser: | |
displayed_only : bool | ||
Whether or not items with "display:none" should be ignored | ||
|
||
extract_hrefs : {"all", "header", "body", "footer"} or None | ||
Section(s) whose table elements with <a> tags should have the href extracted. | ||
|
||
Notes | ||
----- | ||
To subclass this class effectively you must override the following methods: | ||
* :func:`_build_doc` | ||
* :func:`_attr_getter` | ||
* :func:`_href_getter` | ||
* :func:`_text_getter` | ||
* :func:`_parse_td` | ||
* :func:`_parse_thead_tr` | ||
|
@@ -221,12 +231,14 @@ def __init__( | |
attrs: dict[str, str] | None, | ||
encoding: str, | ||
displayed_only: bool, | ||
extract_hrefs: Literal["all", "header", "body", "footer", None], | ||
): | ||
self.io = io | ||
self.match = match | ||
self.attrs = attrs | ||
self.encoding = encoding | ||
self.displayed_only = displayed_only | ||
self.extract_hrefs = extract_hrefs | ||
|
||
def parse_tables(self): | ||
""" | ||
|
@@ -259,6 +271,22 @@ def _attr_getter(self, obj, attr): | |
# Both lxml and BeautifulSoup have the same implementation: | ||
return obj.get(attr) | ||
|
||
def _href_getter(self, obj): | ||
""" | ||
Return a href if the DOM node contains a child <a> or None. | ||
|
||
Parameters | ||
---------- | ||
obj : node-like | ||
A DOM node. | ||
|
||
Returns | ||
------- | ||
href : str or None | ||
The href of the first <a> found inside the DOM node, or None if there is none. | ||
""" | ||
raise AbstractMethodError(self) | ||
|
||
def _text_getter(self, obj): | ||
""" | ||
Return the text of an individual DOM node. | ||
|
@@ -435,20 +463,23 @@ def row_is_all_th(row): | |
while body_rows and row_is_all_th(body_rows[0]): | ||
header_rows.append(body_rows.pop(0)) | ||
|
||
header = self._expand_colspan_rowspan(header_rows) | ||
body = self._expand_colspan_rowspan(body_rows) | ||
footer = self._expand_colspan_rowspan(footer_rows) | ||
header = self._expand_colspan_rowspan(header_rows, section="header") | ||
body = self._expand_colspan_rowspan(body_rows, section="body") | ||
footer = self._expand_colspan_rowspan(footer_rows, section="footer") | ||
|
||
return header, body, footer | ||
|
||
def _expand_colspan_rowspan(self, rows): | ||
def _expand_colspan_rowspan( | ||
self, rows, section: Literal["header", "body", "footer"] | ||
): | ||
""" | ||
Given a list of <tr>s, return a list of text rows. | ||
|
||
Parameters | ||
---------- | ||
rows : list of node-like | ||
List of <tr>s | ||
section : {"header", "body", "footer"} | ||
The section that the rows belong to. | ||
|
||
Returns | ||
------- | ||
|
@@ -461,7 +492,10 @@ def _expand_colspan_rowspan(self, rows): | |
to subsequent cells. | ||
""" | ||
all_texts = [] # list of rows, each a list of str | ||
remainder: list[tuple[int, str, int]] = [] # list of (index, text, nrows) | ||
text: str | tuple | ||
remainder: list[ | ||
tuple[int, str | tuple, int] | ||
] = [] # list of (index, text, nrows) | ||
|
||
for tr in rows: | ||
texts = [] # the output for this row | ||
|
@@ -481,6 +515,11 @@ def _expand_colspan_rowspan(self, rows): | |
|
||
# Append the text from this <td>, colspan times | ||
text = _remove_whitespace(self._text_getter(td)) | ||
if self.extract_hrefs == "all" or self.extract_hrefs == section: | ||
# All cells will be tuples except for the headers for | ||
# consistency in selection (e.g. using .str indexing) | ||
href = self._href_getter(td) | ||
text = (text, href) if href else (text,) | ||
rowspan = int(self._attr_getter(td, "rowspan") or 1) | ||
colspan = int(self._attr_getter(td, "colspan") or 1) | ||
|
||
|
@@ -585,6 +624,10 @@ def _parse_tables(self, doc, match, attrs): | |
raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") | ||
return result | ||
|
||
def _href_getter(self, obj): | ||
Review thread on this line:
- Can you type the args and returns of all of the added code?
- I've typed the returns, but won't lxml/bs4 be required to type the args?
- @attack68 What shall I do about this?
- @jreback Sorry to bother you, but I haven't been able to come up with a solution for this. Could you please suggest how I should do it? To elaborate a bit on my first comment: the requirements may not be installed, and in that case typing with the custom types defined in those libraries would fail (as far as I understand), so that doesn't seem like a viable solution.
- @mroeschke Would you be able to enlighten me regarding this request? I'm still at a loss as to how to approach it.
- At the top of the file you can do: [code snippet not captured in this extract] Then type [remainder not captured in this extract] |
||
a = obj.find("a", href=True) | ||
return None if not a else a["href"] | ||
|
||
def _text_getter(self, obj): | ||
return obj.text | ||
|
||
|
@@ -670,6 +713,10 @@ class _LxmlFrameParser(_HtmlFrameParser): | |
:class:`_HtmlFrameParser`. | ||
""" | ||
|
||
def _href_getter(self, obj): | ||
href = obj.xpath(".//a/@href") | ||
return None if not href else href[0] | ||
|
||
def _text_getter(self, obj): | ||
return obj.text_content() | ||
|
||
|
@@ -906,14 +953,14 @@ def _validate_flavor(flavor): | |
return flavor | ||
|
||
|
||
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): | ||
def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_hrefs, **kwargs): | ||
flavor = _validate_flavor(flavor) | ||
compiled_match = re.compile(match) # you can pass a compiled regex here | ||
|
||
retained = None | ||
for flav in flavor: | ||
parser = _parser_dispatch(flav) | ||
p = parser(io, compiled_match, attrs, encoding, displayed_only) | ||
p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_hrefs) | ||
|
||
try: | ||
tables = p.parse_tables() | ||
|
@@ -964,6 +1011,7 @@ def read_html( | |
na_values=None, | ||
keep_default_na: bool = True, | ||
displayed_only: bool = True, | ||
extract_hrefs: Literal["all", "header", "body", "footer", None] = None, | ||
) -> list[DataFrame]: | ||
r""" | ||
Read HTML tables into a ``list`` of ``DataFrame`` objects. | ||
|
@@ -1058,6 +1106,11 @@ def read_html( | |
displayed_only : bool, default True | ||
Whether elements with "display: none" should be parsed. | ||
|
||
extract_hrefs : {"all", "header", "body", "footer"} or None, default None | ||
abmyii marked this conversation as resolved.
Show resolved
Hide resolved
abmyii marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Table elements in the specified section(s) with <a> tags will have their | ||
href extracted. Note that specifying "header" will result in a | ||
abmyii marked this conversation as resolved.
Show resolved
Hide resolved
|
||
:class:`~pandas.MultiIndex`. | ||
|
||
Returns | ||
------- | ||
dfs | ||
|
@@ -1126,4 +1179,5 @@ def read_html( | |
na_values=na_values, | ||
keep_default_na=keep_default_na, | ||
displayed_only=displayed_only, | ||
extract_hrefs=extract_hrefs, | ||
) |
Uh oh!
There was an error while loading. Please reload this page.