From 234884f95f59b8eb7145987fae17cce2a78ff145 Mon Sep 17 00:00:00 2001 From: Xiaoyu <39925367+xouyang1@users.noreply.github.com> Date: Tue, 9 Jul 2024 19:35:13 -0400 Subject: [PATCH 1/7] Fix index dtype override for default c engine --- pandas/core/indexes/base.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7d43498d4267b..13c42284ff3d2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6379,9 +6379,11 @@ def _transform_index(self, func, *, level=None) -> Index: """ if isinstance(self, ABCMultiIndex): values = [ - self.get_level_values(i).map(func) - if i == level or level is None - else self.get_level_values(i) + ( + self.get_level_values(i).map(func) + if i == level or level is None + else self.get_level_values(i) + ) for i in range(self.nlevels) ] return type(self).from_arrays(values) @@ -7478,7 +7480,9 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: if len(sequences) == 1: if names is not None: names = names[0] - return Index(maybe_sequence_to_range(sequences[0]), name=names) + data = sequences[0] + dtype = data.dtype if isinstance(sequences[0], np.ndarray) else None + return Index(maybe_sequence_to_range(data), dtype=dtype, name=names) else: # TODO: Apply maybe_sequence_to_range to sequences? 
return MultiIndex.from_arrays(sequences, names=names) From ce4b469890dcf56578390053a361e83aab20a53a Mon Sep 17 00:00:00 2001 From: Xiaoyu <39925367+xouyang1@users.noreply.github.com> Date: Tue, 9 Jul 2024 19:53:30 -0400 Subject: [PATCH 2/7] Update v3.0.0 doc --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cd917924880f1..7731406e2fbed 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -568,6 +568,7 @@ I/O - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) +- Bug in :meth:`read_csv` not respecting ``dtype`` for ``index`` when using default c ``engine``. (:issue:`59077`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. 
(:issue:`58159`) From 27f30396c7cdb4b26a50aac44d5f58901461861d Mon Sep 17 00:00:00 2001 From: Xiaoyu <39925367+xouyang1@users.noreply.github.com> Date: Tue, 9 Jul 2024 20:08:35 -0400 Subject: [PATCH 3/7] Update test --- .../io/parser/dtypes/test_dtypes_basic.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index d45368dece6d2..452bd19ecbd3b 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -59,18 +59,25 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): def test_dtype_per_column(all_parsers): parser = all_parsers data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" +one,two,three +1,2.5,11 +2,3.5,12 +3,4.5,13 +4,5.5,14""" expected = DataFrame( - [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + [[1, "2.5", 11], [2, "3.5", 12], [3, "4.5", 13], [4, "5.5", 14]], + columns=["one", "two", "three"], ) expected["one"] = expected["one"].astype(np.float64) expected["two"] = expected["two"].astype(object) + expected["three"] = expected["three"].astype(np.uint32) + expected.set_index("three", inplace=True) - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) + result = parser.read_csv( + StringIO(data), + dtype={"one": np.float64, 1: str, "three": np.uint32}, + index_col="three", + ) tm.assert_frame_equal(result, expected) From dd83c3ddd88ee7804cd28beca1bff39eaa4605b5 Mon Sep 17 00:00:00 2001 From: Xiaoyu <39925367+xouyang1@users.noreply.github.com> Date: Tue, 9 Jul 2024 23:29:56 -0400 Subject: [PATCH 4/7] Fix index dtype override for python engine --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/io/parsers/python_parser.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7731406e2fbed..39b5a810e3a8f 100644 
--- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -568,7 +568,7 @@ I/O - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) -- Bug in :meth:`read_csv` not respecting ``dtype`` for ``index`` when using default c ``engine``. (:issue:`59077`) +- Bug in :meth:`read_csv` not respecting ``dtype`` for the index column(s) selected with ``index_col``. (:issue:`59077`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. 
(:issue:`58159`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 05fe963e9b2b7..5add1cd7e2dbc 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -309,13 +309,13 @@ def read( data, columns = self._exclude_implicit_index(alldata) conv_data = self._convert_data(data) - conv_data = self._do_date_conversions(columns, conv_data) + date_data = self._do_date_conversions(columns, conv_data) index, result_columns = self._make_index( - conv_data, alldata, columns, indexnamerow + date_data, list(conv_data.values()), columns, indexnamerow ) - return index, result_columns, conv_data + return index, result_columns, date_data def _exclude_implicit_index( self, From eb184d548440103321c0fcb958b632e8685c1c75 Mon Sep 17 00:00:00 2001 From: Xiaoyu <39925367+xouyang1@users.noreply.github.com> Date: Wed, 10 Jul 2024 00:13:38 -0400 Subject: [PATCH 5/7] Fix implicit index case for python parsing --- pandas/io/parsers/python_parser.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 5add1cd7e2dbc..ee35b512898c2 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -311,8 +311,12 @@ def read( conv_data = self._convert_data(data) date_data = self._do_date_conversions(columns, conv_data) + if not self._implicit_index: + # propagate index dtype + alldata = list(conv_data.values()) + index, result_columns = self._make_index( - date_data, list(conv_data.values()), columns, indexnamerow + date_data, alldata, columns, indexnamerow ) return index, result_columns, date_data From 07a41abfbd9078a347c4d682d3e9dadde123b8e0 Mon Sep 17 00:00:00 2001 From: Xiaoyu <39925367+xouyang1@users.noreply.github.com> Date: Wed, 10 Jul 2024 00:50:48 -0400 Subject: [PATCH 6/7] Fix implicit datetime index dtype --- pandas/core/indexes/base.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 
deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 13c42284ff3d2..ffd76b4c86670 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7481,8 +7481,13 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: if names is not None: names = names[0] data = sequences[0] - dtype = data.dtype if isinstance(sequences[0], np.ndarray) else None - return Index(maybe_sequence_to_range(data), dtype=dtype, name=names) + conv_data = maybe_sequence_to_range(data) + dtype = ( + data.dtype + if isinstance(data, np.ndarray) and isinstance(conv_data, range) + else None + ) + return Index(conv_data, dtype=dtype, name=names) else: # TODO: Apply maybe_sequence_to_range to sequences? return MultiIndex.from_arrays(sequences, names=names) From ef6fc994f10046037f2aaab2eae6a6e1c1bc665b Mon Sep 17 00:00:00 2001 From: Xiaoyu <39925367+xouyang1@users.noreply.github.com> Date: Wed, 10 Jul 2024 02:14:17 -0400 Subject: [PATCH 7/7] Ignore known mypy issue for np array --- pandas/io/parsers/python_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index ee35b512898c2..e3002afd46c5d 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -313,7 +313,7 @@ def read( if not self._implicit_index: # propagate index dtype - alldata = list(conv_data.values()) + alldata = list(conv_data.values()) # type: ignore[arg-type] index, result_columns = self._make_index( date_data, alldata, columns, indexnamerow