From b5a3dc299c9f8eef93916f8b41b9ecd0ffd150e0 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 4 Feb 2023 20:20:48 -0500 Subject: [PATCH 1/3] unpin pyarrow, fix failing test --- .github/actions/setup-conda/action.yml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-38-downstream_compat.yaml | 2 +- ci/deps/actions-38.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/circle-38-arm64.yaml | 2 +- environment.yml | 2 +- pandas/compat/__init__.py | 2 ++ pandas/compat/pyarrow.py | 2 ++ pandas/core/arrays/arrow/array.py | 13 +++++++++++-- pandas/core/tools/timedeltas.py | 4 +++- pandas/tests/extension/test_arrow.py | 3 ++- requirements-dev.txt | 2 +- 14 files changed, 29 insertions(+), 13 deletions(-) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index f4b1d9e49f63a..002d0020c2df1 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -18,7 +18,7 @@ runs: - name: Set Arrow version in ${{ inputs.environment-file }} to ${{ inputs.pyarrow-version }} run: | grep -q ' - pyarrow' ${{ inputs.environment-file }} - sed -i"" -e "s/ - pyarrow<11/ - pyarrow=${{ inputs.pyarrow-version }}/" ${{ inputs.environment-file }} + sed -i"" -e "s/ - pyarrow/ - pyarrow=${{ inputs.pyarrow-version }}/" ${{ inputs.environment-file }} cat ${{ inputs.environment-file }} shell: bash if: ${{ inputs.pyarrow-version }} diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 25032ed1c76b0..b500cf66b10c2 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -42,7 +42,7 @@ dependencies: - psycopg2 - pymysql - pytables - - pyarrow<11 + - pyarrow - pyreadstat - python-snappy - pyxlsb diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index aef97c232e940..8e15c7b4740c5 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -42,7 +42,7 @@ dependencies: - psycopg2 - pymysql # - pytables>=3.8.0 # first version that supports 3.11 - - pyarrow<11 + - pyarrow - pyreadstat - python-snappy - pyxlsb diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 1de392a9cc277..151cabfacb434 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -40,7 +40,7 @@ dependencies: - openpyxl - odfpy - psycopg2 - - pyarrow<11 + - pyarrow - pymysql - pyreadstat - pytables diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 803b0bdbff793..bd1246ddc7a3e 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -40,7 +40,7 @@ dependencies: - odfpy - pandas-gbq - psycopg2 - - pyarrow<11 + - pyarrow - pymysql - pyreadstat - pytables diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 5ce5681aa9e21..93b73b20591b0 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -41,7 +41,7 @@ dependencies: - pandas-gbq - psycopg2 - pymysql - - pyarrow<11 + - pyarrow - pyreadstat - pytables - python-snappy diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index 7dcb84dc8874c..addbda194cc0c 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -40,7 +40,7 @@ dependencies: - odfpy - pandas-gbq - psycopg2 - - pyarrow<11 + - pyarrow - pymysql # Not provided on ARM #- pyreadstat diff --git a/environment.yml b/environment.yml index 076e6fa727332..47627fcac32e1 100644 --- a/environment.yml +++ b/environment.yml @@ -43,7 +43,7 @@ dependencies: - odfpy - py - psycopg2 - - pyarrow<11 + - pyarrow - pymysql - pyreadstat - pytables diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index b59b9632913e4..b4110baf40c9d 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -31,6 +31,7 @@ pa_version_under7p0, pa_version_under8p0, pa_version_under9p0, + pa_version_under11p0, ) @@ -161,6 +162,7 @@ def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: "pa_version_under7p0", "pa_version_under8p0", "pa_version_under9p0", + "pa_version_under11p0", "IS64", "PY39", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 280fdabf2cc05..23803c2e96a15 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -14,9 +14,11 @@ pa_version_under8p0 = _palv < Version("8.0.0") pa_version_under9p0 = _palv < Version("9.0.0") pa_version_under10p0 = _palv < Version("10.0.0") + pa_version_under11p0 = _palv < Version("11.0.0") except ImportError: pa_version_under6p0 = True pa_version_under7p0 = True pa_version_under8p0 = True pa_version_under9p0 = True pa_version_under10p0 = True + pa_version_under11p0 = True diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9247d26fc846d..363d9a4d0c0cf 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -258,9 +258,18 @@ def _from_sequence_of_strings( scalars = to_datetime(strings, errors="raise").date elif pa.types.is_duration(pa_type): - from pandas.core.tools.timedeltas import to_timedelta + try: + # GH51146 + # attempt to parse as integers reflecting pyarrow's + # duration to string casting behavior: + # pa.scalar(1, pa.duration("s")).cast(pa.string()).as_py() == "1" + scalars = np.array(strings, dtype=object) + scalars = pa.array(scalars, type=pa.string(), from_pandas=True) + scalars = scalars.cast(pa.int64()) + except pa.ArrowInvalid: + from pandas.core.tools.timedeltas import to_timedelta - scalars = to_timedelta(strings, errors="raise") + scalars = to_timedelta(strings, errors="raise") elif pa.types.is_time(pa_type): from pandas.core.tools.times import to_time diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index b968004846e8e..42cf92c6b2a35 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -240,7 +240,9 @@ def _convert_listlike( # returning arg (errors == "ignore"), and where the input is a # generator, we return a useful list-like instead of a # used-up generator - arg = np.array(list(arg), dtype=object) + if not hasattr(arg, "__array__"): + arg = list(arg) + arg = np.array(arg, dtype=object) try: td64arr = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0] diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e31d8605eeb06..519596501257d 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -33,6 +33,7 @@ pa_version_under7p0, pa_version_under8p0, pa_version_under9p0, + pa_version_under11p0, ) from pandas.errors import PerformanceWarning @@ -295,7 +296,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request): reason="Nanosecond time parsing not supported.", ) ) - elif pa.types.is_duration(pa_dtype): + elif pa_version_under11p0 and pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, diff --git a/requirements-dev.txt b/requirements-dev.txt index 04d8b176dffae..112f90222427b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -32,7 +32,7 @@ openpyxl odfpy py psycopg2-binary -pyarrow<11 +pyarrow pymysql pyreadstat tables From 5bc178fd7feab0370b4d76e07d3170139ef2a3db Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 5 Feb 2023 08:38:07 -0500 Subject: [PATCH 2/3] cleanup --- pandas/core/arrays/arrow/array.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 363d9a4d0c0cf..60787a3a6dfb4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -259,12 +259,13 @@ def _from_sequence_of_strings( scalars = to_datetime(strings, errors="raise").date elif pa.types.is_duration(pa_type): try: - # GH51146 - # attempt to parse as integers reflecting pyarrow's - # duration to string casting behavior: - # pa.scalar(1, pa.duration("s")).cast(pa.string()).as_py() == "1" - scalars = np.array(strings, dtype=object) - scalars = pa.array(scalars, type=pa.string(), from_pandas=True) + # GH51175: test_from_sequence_of_strings_pa_array + # attempt to parse as int64 reflecting pyarrow's + # duration to string casting behavior + if isinstance(strings, (pa.Array, pa.ChunkedArray)): + scalars = strings + else: + scalars = pa.array(strings, type=pa.string(), from_pandas=True) scalars = scalars.cast(pa.int64()) except pa.ArrowInvalid: from pandas.core.tools.timedeltas import to_timedelta From 2fadab1e020e7c3c47742fce520edd7b6b8a6d0d Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 7 Feb 2023 20:08:47 -0500 Subject: [PATCH 3/3] handle NaT/NaN --- pandas/core/arrays/arrow/array.py | 22 ++++++++++++---------- pandas/tests/extension/test_arrow.py | 10 ++++++++++ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e195b472620a1..ba4ea80d006c8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -257,19 +257,21 @@ def _from_sequence_of_strings( scalars = to_datetime(strings, errors="raise").date elif pa.types.is_duration(pa_type): - try: + from pandas.core.tools.timedeltas import to_timedelta + + scalars = to_timedelta(strings, errors="raise") + if pa_type.unit != "ns": # GH51175: test_from_sequence_of_strings_pa_array # attempt to parse as int64 reflecting pyarrow's # duration to string casting behavior - if isinstance(strings, (pa.Array, pa.ChunkedArray)): - scalars = strings - else: - scalars = pa.array(strings, type=pa.string(), from_pandas=True) - scalars = scalars.cast(pa.int64()) - except pa.ArrowInvalid: - from pandas.core.tools.timedeltas import to_timedelta - - scalars = to_timedelta(strings, errors="raise") + mask = isna(scalars) + if not isinstance(strings, (pa.Array, pa.ChunkedArray)): + strings = pa.array(strings, type=pa.string(), from_pandas=True) + strings = pc.if_else(mask, None, strings) + try: + scalars = strings.cast(pa.int64()) + except pa.ArrowInvalid: + pass elif pa.types.is_time(pa_type): from pandas.core.tools.times import to_time diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 887e0f77614d2..d21025f5093e2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1600,3 +1600,13 @@ def test_searchsorted_with_na_raises(data_for_sorting, as_series): ) with pytest.raises(ValueError, match=msg): arr.searchsorted(b) + + +@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) +def test_duration_from_strings_with_nat(unit): + # GH51175 + strings = ["1000", "NaT"] + pa_type = pa.duration(unit) + result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa_type) + expected = ArrowExtensionArray(pa.array([1000, None], type=pa_type)) + tm.assert_extension_array_equal(result, expected)