diff --git a/pandas/io/html.py b/pandas/io/html.py index c4ffe332e3020..3193f52d239f1 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -8,7 +8,9 @@ import numbers import os import re +from typing import Dict, List, Optional, Pattern, Sequence, Union +from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError, EmptyDataError from pandas.util._decorators import deprecate_nonkeyword_arguments @@ -16,6 +18,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.frame import DataFrame from pandas.io.common import is_url, urlopen, validate_header_arg from pandas.io.formats.printing import pprint_thing @@ -924,22 +927,22 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): @deprecate_nonkeyword_arguments(version="2.0") def read_html( - io, - match=".+", - flavor=None, - header=None, - index_col=None, - skiprows=None, - attrs=None, - parse_dates=False, - thousands=",", - encoding=None, - decimal=".", - converters=None, + io: FilePathOrBuffer, + match: Union[str, Pattern] = ".+", + flavor: Optional[str] = None, + header: Optional[Union[int, Sequence[int]]] = None, + index_col: Optional[Union[int, Sequence[int]]] = None, + skiprows: Optional[Union[int, Sequence[int], slice]] = None, + attrs: Optional[Dict[str, str]] = None, + parse_dates: bool = False, + thousands: Optional[str] = ",", + encoding: Optional[str] = None, + decimal: str = ".", + converters: Optional[Dict] = None, na_values=None, - keep_default_na=True, - displayed_only=True, -): + keep_default_na: bool = True, + displayed_only: bool = True, +) -> List[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -958,26 +961,26 @@ def read_html( This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. - flavor : str or None + flavor : str, optional The parsing engine to use. 'bs4' and 'html5lib' are synonymous with each other, they are both there for backwards compatibility. The default of ``None`` tries to use ``lxml`` to parse and if that fails it falls back on ``bs4`` + ``html5lib``. - header : int or list-like or None, optional + header : int or list-like, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to make the columns headers. - index_col : int or list-like or None, optional + index_col : int or list-like, optional The column (or list of columns) to use to create the index. - skiprows : int or list-like or slice or None, optional + skiprows : int, list-like or slice, optional Number of rows to skip after parsing the column integer. 0-based. If a sequence of integers or a slice is given, will skip the rows indexed by that sequence. Note that a single element sequence means 'skip the nth row' whereas an integer means 'skip n rows'. - attrs : dict or None, optional + attrs : dict, optional This is a dictionary of attributes that you can pass to use to identify the table in the HTML. These are not checked for validity before being passed to lxml or Beautiful Soup. However, these attributes must be @@ -1005,7 +1008,7 @@ def read_html( thousands : str, optional Separator to use to parse thousands. Defaults to ``','``. - encoding : str or None, optional + encoding : str, optional The encoding used to decode the web page. Defaults to ``None``.``None`` preserves the previous encoding behavior, which depends on the underlying parser library (e.g., the parser library will try to use