From 689db12b74cc3abac2980dcb6ce1e7a2ab416454 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 14 Oct 2022 14:30:51 +0200 Subject: [PATCH 1/5] DEP: Enforce deprecation of mangle_dup cols and convert_float in read_excel --- pandas/io/excel/_base.py | 52 +++------------------------ pandas/io/excel/_odfreader.py | 13 ++++--- pandas/io/excel/_openpyxl.py | 17 ++++----- pandas/io/excel/_pyxlsb.py | 7 ++-- pandas/io/excel/_xlrd.py | 4 +-- pandas/tests/io/excel/test_readers.py | 36 +++---------------- pandas/tests/io/excel/test_writers.py | 42 ++-------------------- 7 files changed, 30 insertions(+), 141 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f555e7c5f5d95..994887f487473 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -42,8 +42,6 @@ from pandas.errors import EmptyDataError from pandas.util._decorators import ( Appender, - deprecate_kwarg, - deprecate_nonkeyword_arguments, doc, ) from pandas.util._exceptions import find_stack_level @@ -269,23 +267,6 @@ comment string and the end of the current line is ignored. skipfooter : int, default 0 Rows at the end to skip (0-indexed). -convert_float : bool, default True - Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric - data will be read in as floats: Excel stores all numbers as floats - internally. - - .. deprecated:: 1.3.0 - convert_float will be removed in a future version - -mangle_dupe_cols : bool, default True - Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than - 'X'...'X'. Passing in False will cause data to be overwritten if there - are duplicate names in the columns. - - .. deprecated:: 1.5.0 - Not implemented, and a new argument to specify the pattern for the - names of duplicated columns will be added instead - {storage_options} .. versionadded:: 1.2.0 @@ -365,6 +346,7 @@ def read_excel( io, # sheet name is str or int -> DataFrame sheet_name: str | int = ..., + *, header: int | Sequence[int] | None = ..., names: list[str] | None = ..., index_col: int | Sequence[int] | None = ..., @@ -392,8 +374,6 @@ def read_excel( decimal: str = ..., comment: str | None = ..., skipfooter: int = ..., - convert_float: bool | None = ..., - mangle_dupe_cols: bool = ..., storage_options: StorageOptions = ..., ) -> DataFrame: ... @@ -404,6 +384,7 @@ def read_excel( io, # sheet name is list or None -> dict[IntStrT, DataFrame] sheet_name: list[IntStrT] | None, + *, header: int | Sequence[int] | None = ..., names: list[str] | None = ..., index_col: int | Sequence[int] | None = ..., @@ -431,20 +412,17 @@ def read_excel( decimal: str = ..., comment: str | None = ..., skipfooter: int = ..., - convert_float: bool | None = ..., - mangle_dupe_cols: bool = ..., storage_options: StorageOptions = ..., ) -> dict[IntStrT, DataFrame]: ... @doc(storage_options=_shared_docs["storage_options"]) -@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None) -@deprecate_nonkeyword_arguments(allowed_args=["io", "sheet_name"], version="2.0") @Appender(_read_excel_doc) def read_excel( io, sheet_name: str | int | list[IntStrT] | None = 0, + *, header: int | Sequence[int] | None = 0, names: list[str] | None = None, index_col: int | Sequence[int] | None = None, @@ -472,8 +450,6 @@ def read_excel( decimal: str = ".", comment: str | None = None, skipfooter: int = 0, - convert_float: bool | None = None, - mangle_dupe_cols: bool = True, storage_options: StorageOptions = None, ) -> DataFrame | dict[IntStrT, DataFrame]: @@ -511,8 +487,6 @@ def read_excel( decimal=decimal, comment=comment, skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, ) finally: # make sure to close opened file handles @@ -588,7 +562,7 @@ def get_sheet_by_index(self, index: int): pass @abc.abstractmethod - def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None): + def get_sheet_data(self, sheet, rows: int | None = None): pass def raise_if_bad_sheet_by_index(self, index: int) -> None: @@ -716,20 +690,9 @@ def parse( decimal: str = ".", comment: str | None = None, skipfooter: int = 0, - convert_float: bool | None = None, - mangle_dupe_cols: bool = True, **kwds, ): - if convert_float is None: - convert_float = True - else: - warnings.warn( - "convert_float is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - validate_header_arg(header) validate_integer("nrows", nrows) @@ -763,7 +726,7 @@ def parse( sheet = self.get_sheet_by_index(asheetname) file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows) - data = self.get_sheet_data(sheet, convert_float, file_rows_needed) + data = self.get_sheet_data(sheet, file_rows_needed) if hasattr(sheet, "close"): # pyxlsb opens two TemporaryFiles sheet.close() @@ -885,7 +848,6 @@ def parse( comment=comment, skipfooter=skipfooter, usecols=usecols, - mangle_dupe_cols=mangle_dupe_cols, **kwds, ) @@ -1718,8 +1680,6 @@ def parse( thousands: str | None = None, comment: str | None = None, skipfooter: int = 0, - convert_float: bool | None = None, - mangle_dupe_cols: bool = True, **kwds, ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]: """ @@ -1751,8 +1711,6 @@ def parse( thousands=thousands, comment=comment, skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, **kwds, ) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 075590f3535fe..8d2434e96ca61 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") def get_sheet_data( - self, sheet, convert_float: bool, file_rows_needed: int | None = None + self, sheet, file_rows_needed: int | None = None ) -> list[list[Scalar | NaTType]]: """ Parse an ODF Table into a list of lists @@ -122,7 +122,7 @@ def get_sheet_data( for sheet_cell in sheet_cells: if sheet_cell.qname == table_cell_name: - value = self._get_cell_value(sheet_cell, convert_float) + value = self._get_cell_value(sheet_cell) else: value = self.empty_value @@ -183,7 +183,7 @@ def _is_empty_row(self, row) -> bool: return True - def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType: + def _get_cell_value(self, cell) -> Scalar | NaTType: from odf.namespaces import OFFICENS if str(cell) == "#N/A": @@ -199,10 +199,9 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType: elif cell_type == "float": # GH5394 cell_value = float(cell.attributes.get((OFFICENS, "value"))) - if convert_float: - val = int(cell_value) - if val == cell_value: - return val + val = int(cell_value) + if val == cell_value: + return val return cell_value elif cell_type == "percentage": cell_value = cell.attributes.get((OFFICENS, "value")) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 6fde319b3a81e..5572116ca29fe 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -581,7 +581,7 @@ def get_sheet_by_index(self, index: int): self.raise_if_bad_sheet_by_index(index) return self.book.worksheets[index] - def _convert_cell(self, cell, convert_float: bool) -> Scalar: + def _convert_cell(self, cell) -> Scalar: from openpyxl.cell.cell import ( TYPE_ERROR, @@ -593,18 +593,15 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: elif cell.data_type == TYPE_ERROR: return np.nan elif cell.data_type == TYPE_NUMERIC: - # GH5394, GH46988 - if convert_float: - val = int(cell.value) - if val == cell.value: - return val - else: - return float(cell.value) + val = int(cell.value) + if val == cell.value: + return val + return float(cell.value) return cell.value def get_sheet_data( - self, sheet, convert_float: bool, file_rows_needed: int | None = None + self, sheet, file_rows_needed: int | None = None ) -> list[list[Scalar]]: if self.book.read_only: @@ -613,7 +610,7 @@ def get_sheet_data( data: list[list[Scalar]] = [] last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): - converted_row = [self._convert_cell(cell, convert_float) for cell in row] + converted_row = [self._convert_cell(cell) for cell in row] while converted_row and converted_row[-1] == "": # trim trailing empty elements converted_row.pop() diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 5d40ccdf2f8f3..634baee63137e 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -65,12 +65,12 @@ def get_sheet_by_index(self, index: int): # There's a fix for this in the source, but the pypi package doesn't have it return self.book.get_sheet(index + 1) - def _convert_cell(self, cell, convert_float: bool) -> Scalar: + def _convert_cell(self, cell) -> Scalar: # TODO: there is no way to distinguish between floats and datetimes in pyxlsb # This means that there is no way to read datetime types from an xlsb file yet if cell.v is None: return "" # Prevents non-named columns from not showing up as Unnamed: i - if isinstance(cell.v, float) and convert_float: + if isinstance(cell.v, float): val = int(cell.v) if val == cell.v: return val @@ -82,7 +82,6 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: def get_sheet_data( self, sheet, - convert_float: bool, file_rows_needed: int | None = None, ) -> list[list[Scalar]]: data: list[list[Scalar]] = [] @@ -91,7 +90,7 @@ def get_sheet_data( # not returned. The cells are namedtuples of row, col, value (r, c, v). for row in sheet.rows(sparse=True): row_number = row[0].r - converted_row = [self._convert_cell(cell, convert_float) for cell in row] + converted_row = [self._convert_cell(cell) for cell in row] while converted_row and converted_row[-1] == "": # trim trailing empty elements converted_row.pop() diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 0bf3ac6134cf6..171705dee6e59 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -62,7 +62,7 @@ def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) def get_sheet_data( - self, sheet, convert_float: bool, file_rows_needed: int | None = None + self, sheet, file_rows_needed: int | None = None ) -> list[list[Scalar]]: from xlrd import ( XL_CELL_BOOLEAN, @@ -104,7 +104,7 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = np.nan elif cell_typ == XL_CELL_BOOLEAN: cell_contents = bool(cell_contents) - elif convert_float and cell_typ == XL_CELL_NUMBER: + elif cell_typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less surprising val = int(cell_contents) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index fa1d6bbfd5a7e..f018ba7a45af9 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -405,7 +405,6 @@ def test_reader_special_dtypes(self, request, read_ext): "FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005], "BoolCol": [True, False, True, True, False], "StrCol": [1, 2, 3, 4, 5], - # GH5394 - this is why convert_float isn't vectorized "Str2Col": ["a", 3, "c", "d", "e"], "DateCol": [ datetime(2013, 10, 30), @@ -424,19 +423,11 @@ def test_reader_special_dtypes(self, request, read_ext): # if not coercing number, then int comes in as float float_expected = expected.copy() - float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - with tm.assert_produces_warning( - FutureWarning, - match="convert_float is deprecated", - raise_on_extra_warnings=False, - ): - # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning - # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml) - # See GH#41176 - actual = pd.read_excel( - basename + read_ext, sheet_name="Sheet1", convert_float=False - ) + # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning + # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml) + # See GH#41176 + actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) @@ -447,31 +438,12 @@ def test_reader_special_dtypes(self, request, read_ext): exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) - # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) actual = pd.read_excel( basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str} ) tm.assert_frame_equal(actual, expected) - no_convert_float = float_expected.copy() - no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - with tm.assert_produces_warning( - FutureWarning, - match="convert_float is deprecated", - raise_on_extra_warnings=False, - ): - # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning - # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml) - # See GH#41176 - actual = pd.read_excel( - basename + read_ext, - sheet_name="Sheet1", - convert_float=False, - converters={"StrCol": str}, - ) - tm.assert_frame_equal(actual, no_convert_float) - # GH8212 - support for converters and missing values def test_reader_converters(self, read_ext): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index d4b74ddbd66e0..ee3e71aa04772 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -471,18 +471,6 @@ def test_int_types(self, np_type, path): recons2 = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) - # Test with convert_float=False comes back as float. - float_frame = df.astype(float) - float_frame.columns = float_frame.columns.astype(float) - float_frame.index = float_frame.index.astype(float) - with tm.assert_produces_warning( - FutureWarning, match="convert_float is deprecated" - ): - recons = pd.read_excel( - path, sheet_name="test1", convert_float=False, index_col=0 - ) - tm.assert_frame_equal(recons, float_frame) - @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) def test_float_types(self, np_type, path): # Test np.float values read come back as float. @@ -972,15 +960,6 @@ def test_duplicated_columns(self, path): result = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(result, expected) - # Explicitly, we pass in the parameter. - with tm.assert_produces_warning( - FutureWarning, match="the 'mangle_dupe_cols' keyword is deprecated" - ): - result = pd.read_excel( - path, sheet_name="test1", index_col=0, mangle_dupe_cols=True - ) - tm.assert_frame_equal(result, expected) - # see gh-11007, gh-10970 df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) df.to_excel(path, "test1") @@ -998,15 +977,6 @@ def test_duplicated_columns(self, path): expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) tm.assert_frame_equal(result, expected) - msg = "Setting mangle_dupe_cols=False is not supported yet" - with tm.assert_produces_warning( - FutureWarning, match="the 'mangle_dupe_cols' keyword is deprecated" - ): - with pytest.raises(ValueError, match=msg): - pd.read_excel( - path, sheet_name="test1", header=None, mangle_dupe_cols=False - ) - def test_swapped_columns(self, path): # Test for issue #5427. write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) @@ -1212,21 +1182,15 @@ def test_merged_cell_custom_objects(self, path): (pd.Period("2018"), pd.Period("2018Q2")), ] ) - expected = DataFrame(np.ones((2, 2)), columns=mi) + expected = DataFrame(np.ones((2, 2), dtype=np.int_), columns=mi) expected.to_excel(path) - with tm.assert_produces_warning( - FutureWarning, match="convert_float is deprecated" - ): - result = pd.read_excel( - path, header=[0, 1], index_col=0, convert_float=False - ) + result = pd.read_excel(path, header=[0, 1], index_col=0) # need to convert PeriodIndexes to standard Indexes for assert equal expected.columns = expected.columns.set_levels( [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], level=[0, 1], ) - expected.index = expected.index.astype(np.float64) - tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [None, object]) def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, path): From 5d9a954a888fde07730ebaa233ced5453f3000af Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 14 Oct 2022 14:47:56 +0200 Subject: [PATCH 2/5] Remove test --- pandas/tests/io/excel/test_readers.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f018ba7a45af9..5354a17aa5066 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1247,12 +1247,6 @@ def test_read_excel_squeeze(self, read_ext): expected = Series([1, 2, 3], name="a") tm.assert_series_equal(actual, expected) - def test_deprecated_kwargs(self, read_ext): - with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): - pd.read_excel("test1" + read_ext, "Sheet1", 0) - - pd.read_excel("test1" + read_ext) - def test_no_header_with_list_index_col(self, read_ext): # GH 31783 file_name = "testmultiindex" + read_ext From bc81cd85865dae85a109a00529cd4e45f0004731 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 14 Oct 2022 15:30:37 +0200 Subject: [PATCH 3/5] ENH: Add use nullable dtypes to read_excel --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/_libs/lib.pyx | 4 +- pandas/io/excel/_base.py | 15 +++++ pandas/io/parsers/base_parser.py | 5 +- pandas/tests/io/excel/test_readers.py | 82 +++++++++++++++++++++++++++ 5 files changed, 105 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 508d5d8bc4cc1..afd2954ce3e00 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -32,7 +32,7 @@ Other enhancements - :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`) - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`) -- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` to enable automatic conversion to nullable dtypes (:issue:`36712`) +- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`) - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`) - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d9a7195520fd7..a4b173e85e964 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2366,7 +2366,7 @@ def maybe_convert_numeric( # This occurs since we disabled float nulls showing as null in anticipation # of seeing ints that were never seen. So then, we return float - if allow_null_in_int and seen.null_ and not seen.int_: + if allow_null_in_int and seen.null_ and not seen.int_ and not seen.bool_: seen.float_ = True if seen.complex_: @@ -2386,6 +2386,8 @@ def maybe_convert_numeric( else: return (ints, None) elif seen.bool_: + if allow_null_in_int: + return (bools.view(np.bool_), mask.view(np.bool_)) return (bools.view(np.bool_), None) elif seen.uint_: return (uints, None) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 994887f487473..2ed4fa6f59206 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -267,6 +267,13 @@ comment string and the end of the current line is ignored. skipfooter : int, default 0 Rows at the end to skip (0-indexed). +use_nullable_dtypes : bool = False + Whether or not to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + .. versionadded:: 2.0 + {storage_options} .. versionadded:: 1.2.0 @@ -375,6 +382,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> DataFrame: ... @@ -413,6 +421,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> dict[IntStrT, DataFrame]: ... @@ -451,6 +460,7 @@ def read_excel( comment: str | None = None, skipfooter: int = 0, storage_options: StorageOptions = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | dict[IntStrT, DataFrame]: should_close = False @@ -487,6 +497,7 @@ def read_excel( decimal=decimal, comment=comment, skipfooter=skipfooter, + use_nullable_dtypes=use_nullable_dtypes, ) finally: # make sure to close opened file handles @@ -690,6 +701,7 @@ def parse( decimal: str = ".", comment: str | None = None, skipfooter: int = 0, + use_nullable_dtypes: bool = False, **kwds, ): @@ -848,6 +860,7 @@ def parse( comment=comment, skipfooter=skipfooter, usecols=usecols, + use_nullable_dtypes=use_nullable_dtypes, **kwds, ) @@ -1680,6 +1693,7 @@ def parse( thousands: str | None = None, comment: str | None = None, skipfooter: int = 0, + use_nullable_dtypes: bool = False, **kwds, ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]: """ @@ -1711,6 +1725,7 @@ def parse( thousands=thousands, comment=comment, skipfooter=skipfooter, + use_nullable_dtypes=use_nullable_dtypes, **kwds, ) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 45f6469a31f4f..9dc743e166c40 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -776,7 +776,10 @@ def _infer_types( bool_mask = np.zeros(result.shape, dtype=np.bool_) result = BooleanArray(result, bool_mask) elif result.dtype == np.object_ and use_nullable_dtypes: - result = StringDtype().construct_array_type()._from_sequence(values) + # read_excel sends array of datetime objects + inferred_type, _ = lib.infer_datetimelike_array(result) + if inferred_type != "datetime": + result = StringDtype().construct_array_type()._from_sequence(values) return result, na_count diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f018ba7a45af9..22bf2deac1a5a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -21,6 +21,10 @@ Series, ) import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ @@ -535,6 +539,84 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) + def test_use_nullable_dtypes(self, read_ext): + # GH#36712 + if read_ext == ".xlsb": + pytest.skip("No engine for filetype: 'xlsb'") + + df = DataFrame( + { + "a": Series([1, 3], dtype="Int64"), + "b": Series([2.5, 4.5], dtype="Float64"), + "c": Series([True, False], dtype="boolean"), + "d": Series(["a", "b"], dtype="string"), + "e": Series([pd.NA, 6], dtype="Int64"), + "f": Series([pd.NA, 7.5], dtype="Float64"), + "g": Series([pd.NA, True], dtype="boolean"), + "h": Series([pd.NA, "a"], dtype="string"), + "i": Series([pd.Timestamp("2019-12-31")] * 2), + "j": Series([pd.NA, pd.NA], dtype="Int64"), + } + ) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, "test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", use_nullable_dtypes=True + ) + tm.assert_frame_equal(result, df) + + def test_use_nullabla_dtypes_and_dtype(self, read_ext): + # GH#36712 + if read_ext == ".xlsb": + pytest.skip("No engine for filetype: 'xlsb'") + + df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, "test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", use_nullable_dtypes=True, dtype="float64" + ) + tm.assert_frame_equal(result, df) + + @td.skip_if_no("pyarrow") + @pytest.mark.parametrize("storage", ["pyarrow", "python"]) + def test_use_nullabla_dtypes_string(self, read_ext, storage): + # GH#36712 + if read_ext == ".xlsb": + pytest.skip("No engine for filetype: 'xlsb'") + + import pyarrow as pa + + with pd.option_context("mode.string_storage", storage): + + df = DataFrame( + { + "a": np.array(["a", "b"], dtype=np.object_), + "b": np.array(["x", pd.NA], dtype=np.object_), + } + ) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, "test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", use_nullable_dtypes=True + ) + + if storage == "python": + expected = DataFrame( + { + "a": StringArray(np.array(["a", "b"], dtype=np.object_)), + "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), + } + ) + else: + expected = DataFrame( + { + "a": ArrowStringArray(pa.array(["a", "b"])), + "b": ArrowStringArray(pa.array(["x", None])), + } + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): # GH#35211 From fc155032a88d7c223ef4a62d5fb76c9948b66dd7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 14 Oct 2022 16:10:48 +0200 Subject: [PATCH 4/5] Move parameters --- pandas/io/excel/_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2ed4fa6f59206..0f414a24da850 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -267,6 +267,10 @@ comment string and the end of the current line is ignored. skipfooter : int, default 0 Rows at the end to skip (0-indexed). +{storage_options} + + .. versionadded:: 1.2.0 + use_nullable_dtypes : bool = False Whether or not to use nullable dtypes as default when reading data. If set to True, nullable dtypes are used for all dtypes that have a nullable @@ -274,10 +278,6 @@ .. versionadded:: 2.0 -{storage_options} - - .. versionadded:: 1.2.0 - Returns ------- DataFrame or dict of DataFrames From 29607affd65d7d0d88e878c142d6766bd4a37632 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 24 Oct 2022 23:40:40 +0200 Subject: [PATCH 5/5] Fix docstring --- pandas/io/excel/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 0f414a24da850..5698c1a5af0e9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -271,10 +271,10 @@ .. versionadded:: 1.2.0 -use_nullable_dtypes : bool = False +use_nullable_dtypes : bool, default False Whether or not to use nullable dtypes as default when reading data. If set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. + implementation, even if no nulls are present. Dtype takes precedence if given. .. versionadded:: 2.0