diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 381a05a18b278..212e1f21d984d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -322,6 +322,7 @@ I/O - Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`) - Bug in :meth:`~HDFStore.put` raising a wrong ``TypeError`` when saving a DataFrame with non-string dtype (:issue:`34274`) - Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`) +- Bug in :func:`read_csv` apllying thousands separator to date columns when column should be parsed for dates and ``usecols`` is specified for ``engine="python"`` (:issue:`39365`) - Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`) - :func:`read_excel` now respects :func:`set_option` (:issue:`34252`) - Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 64c3b1e64a659..0d23addbb5f21 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -172,6 +172,8 @@ def __init__(self, kwds): self._first_chunk = True + self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.handles: Optional[IOHandles] = None def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: @@ -546,6 +548,74 @@ def _convert_to_ndarrays( print(f"Filled {na_count} NA values in column {c!s}") return result + def _set_noconvert_dtype_columns( + self, col_indices: List[int], names: List[Union[int, str]] + ) -> Set[int]: + """ + Set the columns that should not undergo dtype conversions. 
+ + Currently, any column that is involved with date parsing will not + undergo such conversions. If usecols is specified, the positions of the columns + not to cast is relative to the usecols not to all columns. + + Parameters + ---------- + col_indices: The indices specifying order and positions of the columns + names: The column names which order is corresponding with the order + of col_indices + + Returns + ------- + A set of integers containing the positions of the columns not to convert. + """ + usecols: Optional[Union[List[int], List[str]]] + noconvert_columns = set() + if self.usecols_dtype == "integer": + # A set of integers will be converted to a list in + # the correct order every single time. + usecols = sorted(self.usecols) + elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): + # The names attribute should have the correct columns + # in the proper order for indexing with parse_dates. + usecols = col_indices + else: + # Usecols is empty. + usecols = None + + def _set(x) -> int: + if usecols is not None and is_integer(x): + x = usecols[x] + + if not is_integer(x): + x = col_indices[names.index(x)] + + return x + + if isinstance(self.parse_dates, list): + for val in self.parse_dates: + if isinstance(val, list): + for k in val: + noconvert_columns.add(_set(k)) + else: + noconvert_columns.add(_set(val)) + + elif isinstance(self.parse_dates, dict): + for val in self.parse_dates.values(): + if isinstance(val, list): + for k in val: + noconvert_columns.add(_set(k)) + else: + noconvert_columns.add(_set(val)) + + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + noconvert_columns.add(_set(k)) + elif self.index_col is not None: + noconvert_columns.add(_set(self.index_col)) + + return noconvert_columns + def _infer_types(self, values, na_values, try_num_bool=True): """ Infer types of values, possibly casting diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py 
index b4c00dfe9b3e7..9bd3bc9fb5c62 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -1,8 +1,6 @@ import pandas._libs.parsers as parsers from pandas._typing import FilePathOrBuffer -from pandas.core.dtypes.common import is_integer - from pandas.core.indexes.api import ensure_index_from_sequences from pandas.io.parsers.base_parser import ParserBase, is_index_col @@ -19,7 +17,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds): kwds["allow_leading_cols"] = self.index_col is not False # GH20529, validate usecol arg before TextReader - self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) kwds["usecols"] = self.usecols # open handles @@ -159,58 +156,11 @@ def _set_noconvert_columns(self): Currently, any column that is involved with date parsing will not undergo such conversions. """ - names = self.orig_names - if self.usecols_dtype == "integer": - # A set of integers will be converted to a list in - # the correct order every single time. - usecols = list(self.usecols) - usecols.sort() - elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): - # The names attribute should have the correct columns - # in the proper order for indexing with parse_dates. - usecols = self.names[:] - else: - # Usecols is empty. 
- - # pandas\io\parsers.py:2030: error: Incompatible types in - # assignment (expression has type "None", variable has type - # "List[Any]") [assignment] - usecols = None # type: ignore[assignment] - - def _set(x): - if usecols is not None and is_integer(x): - x = usecols[x] - - if not is_integer(x): - # assert for mypy, names is List or None, None would error when calling - # .index() - assert names is not None - x = names.index(x) - - self._reader.set_noconvert(x) - - if isinstance(self.parse_dates, list): - for val in self.parse_dates: - if isinstance(val, list): - for k in val: - _set(k) - else: - _set(val) - - elif isinstance(self.parse_dates, dict): - for val in self.parse_dates.values(): - if isinstance(val, list): - for k in val: - _set(k) - else: - _set(val) - - elif self.parse_dates: - if isinstance(self.index_col, list): - for k in self.index_col: - _set(k) - elif self.index_col is not None: - _set(self.index_col) + assert self.orig_names is not None + col_indices = [self.orig_names.index(x) for x in self.names] + noconvert_columns = self._set_noconvert_dtype_columns(col_indices, self.names) + for col in noconvert_columns: + self._reader.set_noconvert(col) def set_error_bad_lines(self, status): self._reader.set_error_bad_lines(int(status)) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index ba05eb4a6599f..c005f69e3c04e 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -3,7 +3,7 @@ from io import StringIO import re import sys -from typing import Iterator, List, Optional, cast +from typing import Iterator, List, Optional, Set, cast import numpy as np @@ -53,7 +53,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): self.skipinitialspace = kwds["skipinitialspace"] self.lineterminator = kwds["lineterminator"] self.quoting = kwds["quoting"] - self.usecols, _ = self._validate_usecols_arg(kwds["usecols"]) self.skip_blank_lines = kwds["skip_blank_lines"] 
self.warn_bad_lines = kwds["warn_bad_lines"] @@ -136,10 +135,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): self._col_indices = list(range(len(self.columns))) self._validate_parse_dates_presence(self.columns) + no_thousands_columns: Optional[Set[int]] = None if self.parse_dates: - self._no_thousands_columns = self._set_no_thousands_columns() - else: - self._no_thousands_columns = None + no_thousands_columns = self._set_noconvert_dtype_columns( + self._col_indices, self.columns + ) + self._no_thousands_columns = no_thousands_columns if len(self.decimal) != 1: raise ValueError("Only length-1 decimal markers supported") @@ -155,44 +156,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): ) self.num = re.compile(regex) - def _set_no_thousands_columns(self): - # Create a set of column ids that are not to be stripped of thousands - # operators. - noconvert_columns = set() - - def _set(x): - if is_integer(x): - noconvert_columns.add(x) - else: - assert self._col_indices is not None - col_indices = self._col_indices - noconvert_columns.add(col_indices[self.columns.index(x)]) - - if isinstance(self.parse_dates, list): - for val in self.parse_dates: - if isinstance(val, list): - for k in val: - _set(k) - else: - _set(val) - - elif isinstance(self.parse_dates, dict): - for val in self.parse_dates.values(): - if isinstance(val, list): - for k in val: - _set(k) - else: - _set(val) - - elif self.parse_dates: - if isinstance(self.index_col, list): - for k in self.index_col: - _set(k) - elif self.index_col is not None: - _set(self.index_col) - - return noconvert_columns - def _make_reader(self, f): sep = self.delimiter diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index eb640e324e676..25d98928f1a6b 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1603,3 +1603,21 @@ def test_date_parser_and_names(all_parsers): result = 
def test_date_parser_usecols_thousands(all_parsers):
    # GH#39365: with usecols given, the thousands separator must not be
    # applied to the column selected for date parsing.
    data = """A,B,C
    1,3,20-09-01-01
    2,4,20-09-01-01
    """

    # parse_dates=[1] is relative to usecols, i.e. it refers to column "C".
    read_kwargs = {"parse_dates": [1], "usecols": [1, 2], "thousands": "-"}
    parsed_date = Timestamp("20-09-2001 01:00:00")
    expected = DataFrame({"B": [3, 4], "C": [parsed_date, parsed_date]})

    result = all_parsers.read_csv(StringIO(data), **read_kwargs)
    tm.assert_frame_equal(result, expected)