From e8fea9c8e125ad105ef4f54e0a3f9072e3e67ede Mon Sep 17 00:00:00 2001 From: Matheus Felipe Date: Sun, 15 Oct 2023 07:16:38 -0300 Subject: [PATCH] TYP/DOC: add HTMLFlavors type to read_html and related --- pandas/_typing.py | 3 +++ pandas/io/html.py | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index de01434c09c39..c2d51f63eb2ab 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -410,6 +410,9 @@ def closed(self) -> bool: # read_xml parsers XMLParsers = Literal["lxml", "etree"] +# read_html flavors +HTMLFlavors = Literal["lxml", "html5lib", "bs4"] + # Interval closed type IntervalLeftRight = Literal["left", "right"] IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]] diff --git a/pandas/io/html.py b/pandas/io/html.py index 68d30fe5ba681..5d5bf079784be 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -57,6 +57,7 @@ BaseBuffer, DtypeBackend, FilePath, + HTMLFlavors, ReadBuffer, StorageOptions, ) @@ -889,13 +890,13 @@ def _data_to_frame(**kwargs): } -def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]: +def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]: """ Choose the parser based on the input flavor. Parameters ---------- - flavor : str + flavor : {"lxml", "html5lib", "bs4"} or None The type of parser to use. This must be a valid backend. Returns @@ -1033,7 +1034,7 @@ def read_html( io: FilePath | ReadBuffer[str], *, match: str | Pattern = ".+", - flavor: str | Sequence[str] | None = None, + flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None, header: int | Sequence[int] | None = None, index_col: int | Sequence[int] | None = None, skiprows: int | Sequence[int] | slice | None = None, @@ -1074,7 +1075,7 @@ def read_html( This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. 
- flavor : str or list-like, optional + flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional The parsing engine (or list of parsing engines) to use. 'bs4' and 'html5lib' are synonymous with each other, they are both there for backwards compatibility. The default of ``None`` tries to use ``lxml``