From d0263c996739588a561b0f80ce3f40cad6a26860 Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Mon, 22 Jan 2018 16:09:20 -0800
Subject: [PATCH 1/7] implement io_helper

---
 pandas/_libs/io_helper.pyx     | 233 +++++++++++++++++++++++++++++++++
 pandas/_libs/lib.pyx           | 203 +---------------------------
 pandas/_libs/src/inference.pyx |  44 ++-----
 pandas/io/formats/format.py    |   5 +-
 pandas/io/json/normalize.py    |   2 +-
 pandas/io/parsers.py           |   5 +-
 pandas/io/pytables.py          |   8 +-
 pandas/io/stata.py             |   3 +-
 pandas/tests/dtypes/test_io.py |  73 -----------
 pandas/tests/test_lib.py       |  10 +-
 setup.py                       |   4 +
 11 files changed, 264 insertions(+), 326 deletions(-)
 create mode 100644 pandas/_libs/io_helper.pyx
 delete mode 100644 pandas/tests/dtypes/test_io.py

diff --git a/pandas/_libs/io_helper.pyx b/pandas/_libs/io_helper.pyx
new file mode 100644
index 0000000000000..aa9af96c1bd6c
--- /dev/null
+++ b/pandas/_libs/io_helper.pyx
@@ -0,0 +1,233 @@
+# -*- coding: utf-8 -*-
+
+cimport cython
+from cython cimport Py_ssize_t
+
+from cpython cimport (PyString_Check, PyBytes_Check, PyUnicode_Check,
+                      PyBytes_GET_SIZE, PyUnicode_GET_SIZE)
+
+try:
+    from cpython cimport PyString_GET_SIZE
+except ImportError:
+    from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport ndarray, uint8_t
+cnp.import_array()
+
+cimport util
+
+
+ctypedef fused pandas_string:
+    str
+    unicode
+    bytes
+
+
+def sanitize_objects(ndarray[object] values, set na_values,
+                     convert_empty=True):
+    cdef:
+        Py_ssize_t i, n
+        object val, onan
+        Py_ssize_t na_count = 0
+        dict memo = {}
+
+    n = len(values)
+    onan = np.nan
+
+    for i from 0 <= i < n:
+        val = values[i]
+        if (convert_empty and val == '') or (val in na_values):
+            values[i] = onan
+            na_count += 1
+        elif val in memo:
+            values[i] = memo[val]
+        else:
+            memo[val] = val
+
+    return na_count
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def write_csv_rows(list data, ndarray data_index,
+                   int nlevels, ndarray cols, object writer):
+
+    cdef int N, j, i, ncols
+    cdef list rows
+    cdef object val
+
+    # In crude testing, N>100 yields little marginal improvement
+    N = 100
+
+    # pre-allocate rows
+    ncols = len(cols)
+    rows = [[None] * (nlevels + ncols) for x in range(N)]
+
+    j = -1
+    if nlevels == 1:
+        for j in range(len(data_index)):
+            row = rows[j % N]
+            row[0] = data_index[j]
+            for i in range(ncols):
+                row[1 + i] = data[i][j]
+
+            if j >= N - 1 and j % N == N - 1:
+                writer.writerows(rows)
+    elif nlevels > 1:
+        for j in range(len(data_index)):
+            row = rows[j % N]
+            row[:nlevels] = list(data_index[j])
+            for i in range(ncols):
+                row[nlevels + i] = data[i][j]
+
+            if j >= N - 1 and j % N == N - 1:
+                writer.writerows(rows)
+    else:
+        for j in range(len(data_index)):
+            row = rows[j % N]
+            for i in range(ncols):
+                row[i] = data[i][j]
+
+            if j >= N - 1 and j % N == N - 1:
+                writer.writerows(rows)
+
+    if j >= 0 and (j < N - 1 or (j % N) != N - 1):
+        writer.writerows(rows[:((j + 1) % N)])
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def convert_json_to_lines(object arr):
+    """
+    replace comma separated json with line feeds, paying special attention
+    to quotes & brackets
+    """
+    cdef:
+        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
+        bint in_quotes = 0, is_escaping = 0
+        ndarray[uint8_t] narr
+        unsigned char v, comma, left_bracket, right_bracket, newline
+        unsigned char quote, backslash
+
+    newline = ord('\n')
+    comma = ord(',')
+    left_bracket = ord('{')
+    right_bracket = ord('}')
+    quote = ord('"')
+    backslash = ord('\\')
+
+    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
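+    # scan byte-by-byte, tracking backslash escapes and whether we are
+    # inside a quoted string; commas at bracket depth zero and outside
+    # quotes are rewritten to newlines in place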
+    length = narr.shape[0]
+    for i in range(length):
+        v = narr[i]
+        if v == quote and i > 0 and not is_escaping:
+            in_quotes = ~in_quotes
+        if v == backslash or is_escaping:
+            is_escaping = ~is_escaping
+        if v == comma:  # commas that should be \n
+            if num_open_brackets_seen == 0 and not in_quotes:
+                narr[i] = newline
+        elif v == left_bracket:
+            if not in_quotes:
+                num_open_brackets_seen += 1
+        elif v == right_bracket:
+            if not in_quotes:
+                num_open_brackets_seen -= 1
+
+    return narr.tostring().decode('utf-8')
+
+
+# stata, pytables
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr):
+    """ return the maximum size of elements in a 1-dim string array """
+    cdef:
+        Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
+        pandas_string v
+
+    for i in range(length):
+        v = arr[i]
+        if PyString_Check(v):
+            l = PyString_GET_SIZE(v)
+        elif PyBytes_Check(v):
+            l = PyBytes_GET_SIZE(v)
+        elif PyUnicode_Check(v):
+            l = PyUnicode_GET_SIZE(v)
+
+        if l > m:
+            m = l
+
+    return m
+
+
+# ------------------------------------------------------------------
+# PyTables Helpers
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def string_array_replace_from_nan_rep(
+        ndarray[object, ndim=1] arr, object nan_rep,
+        object replace=None):
+    """
+    Replace the values in the array with 'replace' if
+    they are 'nan_rep'. Return the same array.
+    """
+
+    cdef int length = arr.shape[0], i = 0
+    if replace is None:
+        replace = np.nan
+
+    for i from 0 <= i < length:
+        if arr[i] == nan_rep:
+            arr[i] = replace
+
+    return arr
+
+
+def convert_timestamps(ndarray values):
+    cdef:
+        object val, f, result
+        dict cache = {}
+        Py_ssize_t i, n = len(values)
+        ndarray[object] out
+
+    # for HDFStore, a bit temporary but...
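+    # each distinct raw value goes through datetime.fromtimestamp once
+    # and is then reused from `cache`, which helps when an HDFStore
+    # column holds repeated timestamps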
+ + from datetime import datetime + f = datetime.fromtimestamp + + out = np.empty(n, dtype='O') + + for i in range(n): + val = util.get_value_1d(values, i) + if val in cache: + out[i] = cache[val] + else: + cache[val] = out[i] = f(val) + + return out + + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < n: + val = values[i] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1632f5d016439..e31bc950fc0d5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -17,42 +17,26 @@ from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM, np.import_array() np.import_ufunc() -from libc.stdlib cimport malloc, free - from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, - PyList_Check, PyFloat_Check, + PyList_Check, PyFloat_Check, PyBool_Check, PyString_Check, PyBytes_Check, PyUnicode_Check, PyTuple_New, PyObject_RichCompareBool, - PyBytes_GET_SIZE, - PyUnicode_GET_SIZE, PyObject) -try: - from cpython cimport PyString_GET_SIZE -except ImportError: - from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE - cimport cpython -isnan = np.isnan -cdef double NaN = np.NaN -cdef double nan = NaN - from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyTime_Check, PyDelta_Check, PyDateTime_IMPORT) PyDateTime_IMPORT -from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value - from tslib import NaT, Timestamp, Timedelta, array_to_datetime from interval import Interval from missing cimport checknull - cimport util cdef int64_t NPY_NAT = util.get_nat() from util cimport is_array, _checknull @@ -138,28 +122,6 @@ def item_from_zerodim(object val): return util.unbox_if_zerodim(val) -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique(ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - list uniques = [] - dict table = {} - object val, stub = 0 - - for i from 0 <= i < n: - val = values[i] - if val not in table: - table[val] = stub - uniques.append(val) - try: - uniques.sort() - except Exception: - pass - - return uniques - - @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple(list arrays): @@ -379,30 +341,6 @@ def has_infs_f8(ndarray[float64_t] arr): return False -def convert_timestamps(ndarray values): - cdef: - object val, f, result - dict cache = {} - Py_ssize_t i, n = len(values) - ndarray[object] out - - # for HDFStore, a bit temporary but... 
- - from datetime import datetime - f = datetime.fromtimestamp - - out = np.empty(n, dtype='O') - - for i in range(n): - val = util.get_value_1d(values, i) - if val in cache: - out[i] = cache[val] - else: - cache[val] = out[i] = f(val) - - return out - - def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len): cdef: Py_ssize_t i, n = len(indices) @@ -742,145 +680,6 @@ def clean_index_list(list obj): return np.asarray(obj), 0 -ctypedef fused pandas_string: - str - unicode - bytes - - -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr): - """ return the maximum size of elements in a 1-dim string array """ - cdef: - Py_ssize_t i, m = 0, l = 0, length = arr.shape[0] - pandas_string v - - for i in range(length): - v = arr[i] - if PyString_Check(v): - l = PyString_GET_SIZE(v) - elif PyBytes_Check(v): - l = PyBytes_GET_SIZE(v) - elif PyUnicode_Check(v): - l = PyUnicode_GET_SIZE(v) - - if l > m: - m = l - - return m - - -@cython.boundscheck(False) -@cython.wraparound(False) -def string_array_replace_from_nan_rep( - ndarray[object, ndim=1] arr, object nan_rep, - object replace=None): - """ - Replace the values in the array with 'replacement' if - they are 'nan_rep'. Return the same array. - """ - - cdef int length = arr.shape[0], i = 0 - if replace is None: - replace = np.nan - - for i from 0 <= i < length: - if arr[i] == nan_rep: - arr[i] = replace - - return arr - - -@cython.boundscheck(False) -@cython.wraparound(False) -def convert_json_to_lines(object arr): - """ - replace comma separated json with line feeds, paying special attention - to quotes & brackets - """ - cdef: - Py_ssize_t i = 0, num_open_brackets_seen = 0, length - bint in_quotes = 0, is_escaping = 0 - ndarray[uint8_t] narr - unsigned char v, comma, left_bracket, right_brack, newline - - newline = ord('\n') - comma = ord(',') - left_bracket = ord('{') - right_bracket = ord('}') - quote = ord('"') - backslash = ord('\\') - - narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy() - length = narr.shape[0] - for i in range(length): - v = narr[i] - if v == quote and i > 0 and not is_escaping: - in_quotes = ~in_quotes - if v == backslash or is_escaping: - is_escaping = ~is_escaping - if v == comma: # commas that should be \n - if num_open_brackets_seen == 0 and not in_quotes: - narr[i] = newline - elif v == left_bracket: - if not in_quotes: - num_open_brackets_seen += 1 - elif v == right_bracket: - if not in_quotes: - num_open_brackets_seen -= 1 - - return narr.tostring().decode('utf-8') - - -@cython.boundscheck(False) -@cython.wraparound(False) -def write_csv_rows(list data, ndarray data_index, - int nlevels, ndarray cols, object writer): - - cdef int N, j, i, ncols - cdef list rows - cdef object val - - # In crude testing, N>100 yields little marginal improvement - N=100 - - # pre-allocate rows - ncols = len(cols) - rows = [[None] * (nlevels + ncols) for x in range(N)] - - j = -1 - if nlevels == 1: - for j in range(len(data_index)): - row = rows[j % N] - row[0] = data_index[j] - for i in range(ncols): - row[1 + i] = data[i][j] - - if j >= N - 1 and j % N == N - 1: - writer.writerows(rows) - elif nlevels > 1: - for j in range(len(data_index)): - row = rows[j % N] - row[:nlevels] = list(data_index[j]) - for i in range(ncols): - row[nlevels + i] = data[i][j] - - if j >= N - 1 and j % N == N - 1: - writer.writerows(rows) - else: - for j in range(len(data_index)): - row = rows[j % N] - for i in range(ncols): - row[i] = data[i][j] - - if j >= N - 1 and j % N == N 
- 1: - writer.writerows(rows) - - if j >= 0 and (j < N - 1 or (j % N) != N - 1): - writer.writerows(rows[:((j + 1) % N)]) - - # ------------------------------------------------------------------------------ # Groupby-related functions diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index e15f276b39bf8..17ff062c15497 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -6,14 +6,14 @@ from tslibs.nattype import NaT from tslibs.conversion cimport convert_to_tsobject from tslibs.timedeltas cimport convert_to_timedelta64 from tslibs.timezones cimport get_timezone, tz_compare -from datetime import datetime, timedelta + iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 -from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX, - INT8_MIN, INT8_MAX, INT16_MIN, INT16_MAX, - INT32_MAX, INT32_MIN, INT64_MAX, INT64_MIN) +from util cimport UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN + +cdef double nan = np.NaN # core.common import for fast inference checks @@ -737,7 +737,7 @@ cdef class IntegerFloatValidator(Validator): return issubclass(self.dtype.type, np.integer) -cpdef bint is_integer_float_array(ndarray values): +cdef bint is_integer_float_array(ndarray values): cdef: IntegerFloatValidator validator = IntegerFloatValidator( len(values), @@ -788,7 +788,7 @@ cdef class UnicodeValidator(Validator): return issubclass(self.dtype.type, np.unicode_) -cpdef bint is_unicode_array(ndarray values, bint skipna=False): +cdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: UnicodeValidator validator = UnicodeValidator( len(values), @@ -807,7 +807,7 @@ cdef class BytesValidator(Validator): return issubclass(self.dtype.type, np.bytes_) -cpdef bint is_bytes_array(ndarray values, bint skipna=False): +cdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: BytesValidator validator = BytesValidator( len(values), @@ -1390,34 +1390,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects -def convert_sql_column(x): - return maybe_convert_objects(x, try_float=1) - - -def sanitize_objects(ndarray[object] values, set na_values, - convert_empty=True): - cdef: - Py_ssize_t i, n - object val, onan - Py_ssize_t na_count = 0 - dict memo = {} - - n = len(values) - onan = np.nan - - for i from 0 <= i < n: - val = values[i] - if (convert_empty and val == '') or (val in na_values): - values[i] = onan - na_count += 1 - elif val in memo: - values[i] = memo[val] - else: - memo[val] = val - - return na_count - - def maybe_convert_bool(ndarray[object] arr, true_values=None, false_values=None): cdef: @@ -1443,7 +1415,7 @@ def maybe_convert_bool(ndarray[object] arr, for i from 0 <= i < n: val = arr[i] - if cpython.PyBool_Check(val): + if PyBool_Check(val): if val is True: result[i] = 1 else: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2293032ebb8a1..04d0a048bfdd3 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -38,7 +38,7 @@ _stringify_path) from pandas.io.formats.printing import adjoin, justify, pprint_thing from pandas.io.formats.common import get_level_lengths -from pandas._libs import lib +from pandas._libs import lib, io_helper as libio from pandas._libs.tslib import (iNaT, Timestamp, Timedelta, format_array_from_datetime) from pandas.core.indexes.datetimes import DatetimeIndex @@ -1789,7 +1789,8 @@ def _save_chunk(self, start_i, end_i): date_format=self.date_format, quoting=self.quoting) - lib.write_csv_rows(self.data, ix, 
self.nlevels, self.cols, self.writer) + libio.write_csv_rows(self.data, ix, self.nlevels, + self.cols, self.writer) # ---------------------------------------------------------------------- diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 595031b04e367..fa03e7bb5caa2 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -5,7 +5,7 @@ from collections import defaultdict import numpy as np -from pandas._libs.lib import convert_json_to_lines +from pandas._libs.io_helper import convert_json_to_lines from pandas import compat, DataFrame diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5135bb01fb378..545b40f2cc96b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -43,6 +43,7 @@ import pandas._libs.lib as lib import pandas._libs.parsers as parsers +from pandas._libs import io_helper as libio from pandas._libs.tslibs import parsing # BOM character (byte order mark) @@ -1596,11 +1597,11 @@ def _infer_types(self, values, na_values, try_num_bool=True): except Exception: result = values if values.dtype == np.object_: - na_count = lib.sanitize_objects(result, na_values, False) + na_count = libio.sanitize_objects(result, na_values, False) else: result = values if values.dtype == np.object_: - na_count = lib.sanitize_objects(values, na_values, False) + na_count = libio.sanitize_objects(values, na_values, False) if result.dtype == np.object_ and try_num_bool: result = lib.maybe_convert_bool(values, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 106823199ee93..6efbb3f19d1c3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -47,7 +47,7 @@ from pandas.core.config import get_option from pandas.core.computation.pytables import Expr, maybe_expression -from pandas._libs import algos, lib +from pandas._libs import algos, lib, io_helper as libio from pandas._libs.tslibs import timezones from distutils.version import LooseVersion @@ -3843,7 +3843,7 @@ def read(self, where=None, columns=None, **kwargs): # need a better algorithm tuple_index = long_index.values - unique_tuples = lib.fast_unique(tuple_index) + unique_tuples = libio.fast_unique(tuple_index) unique_tuples = com._asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) @@ -4604,7 +4604,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): if nan_rep is None: nan_rep = 'nan' - data = lib.string_array_replace_from_nan_rep(data, nan_rep) + data = libio.string_array_replace_from_nan_rep(data, nan_rep) return data.reshape(shape) @@ -4621,7 +4621,7 @@ def _get_converter(kind, encoding): if kind == 'datetime64': return lambda x: np.asarray(x, dtype='M8[ns]') elif kind == 'datetime': - return lib.convert_timestamps + return libio.convert_timestamps elif kind == 'string': return lambda x: _unconvert_string_array(x, encoding=encoding) else: # pragma: no cover diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b409cf20e9a09..60af6242ee56d 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -16,7 +16,8 @@ import numpy as np from dateutil.relativedelta import relativedelta -from pandas._libs.lib import max_len_string_array, infer_dtype +from pandas._libs.lib import infer_dtype +from pandas._libs.io_helper import max_len_string_array from pandas._libs.tslib import NaT, Timestamp import pandas as pd diff --git a/pandas/tests/dtypes/test_io.py b/pandas/tests/dtypes/test_io.py deleted file mode 100644 index 06b61371c9a0b..0000000000000 --- a/pandas/tests/dtypes/test_io.py +++ /dev/null @@ -1,73 +0,0 @@ -# -*- 
coding: utf-8 -*- - -import numpy as np -import pandas._libs.lib as lib -import pandas.util.testing as tm - -from pandas.compat import long, u - - -class TestParseSQL(object): - - def test_convert_sql_column_floats(self): - arr = np.array([1.5, None, 3, 4.2], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_strings(self): - arr = np.array(['1.5', None, '3', '4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_unicode(self): - arr = np.array([u('1.5'), None, u('3'), u('4.2')], - dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], - dtype=object) - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_ints(self): - arr = np.array([1, 2, 3, 4], dtype='O') - arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') - result = lib.convert_sql_column(arr) - result2 = lib.convert_sql_column(arr2) - expected = np.array([1, 2, 3, 4], dtype='i8') - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result2, expected) - - arr = np.array([1, 2, 3, None, 4], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_longs(self): - arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, 4], dtype='i8') - tm.assert_numpy_array_equal(result, expected) - - arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_bools(self): - arr = np.array([True, False, True, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, True, False], dtype=bool) - tm.assert_numpy_array_equal(result, expected) - - arr = np.array([True, False, None, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, np.nan, False], dtype=object) - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_decimals(self): - from decimal import Decimal - arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 10061204df42a..66e9886a3d998 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -3,7 +3,7 @@ import pytest import numpy as np -from pandas._libs import lib +from pandas._libs import lib, io_helper as libio import pandas.util.testing as tm @@ -12,19 +12,19 @@ class TestMisc(object): def test_max_len_string_array(self): arr = a = np.array(['foo', 'b', np.nan], dtype='object') - assert lib.max_len_string_array(arr) == 3 + assert libio.max_len_string_array(arr) == 3 # unicode arr = a.astype('U').astype(object) - assert lib.max_len_string_array(arr) == 3 + assert libio.max_len_string_array(arr) == 3 # bytes for python3 arr = a.astype('S').astype(object) - assert lib.max_len_string_array(arr) == 3 + assert 
libio.max_len_string_array(arr) == 3

         # raises
         pytest.raises(TypeError,
-                      lambda: lib.max_len_string_array(arr.astype('U')))
+                      lambda: libio.max_len_string_array(arr.astype('U')))

     def test_fast_unique_multiple_list_gen_sort(self):
         keys = [['p', 'a'], ['n', 'd'], ['a', 's']]
diff --git a/setup.py b/setup.py
index 7ade1544ec5cd..c889bac898e33 100755
--- a/setup.py
+++ b/setup.py
@@ -307,6 +307,7 @@ class CheckSDist(sdist_class):
                  'pandas/_libs/join.pyx',
                  'pandas/_libs/indexing.pyx',
                  'pandas/_libs/interval.pyx',
+                 'pandas/_libs/io_helper.pyx',
                  'pandas/_libs/hashing.pyx',
                  'pandas/_libs/missing.pyx',
                  'pandas/_libs/reduction.pyx',
@@ -486,6 +487,9 @@ def pxd(name):
         'pyxfile': '_libs/interval',
         'pxdfiles': ['_libs/hashtable'],
         'depends': _pxi_dep['interval']},
+    '_libs.io_helper': {
+        'pyxfile': '_libs/io_helper',
+        'pxdfiles': ['_libs/src/util']},
     '_libs.join': {
         'pyxfile': '_libs/join',
         'pxdfiles': ['_libs/src/util', '_libs/hashtable'],

From 27c7ffb0b7f8c028e2f2f0efdad962889fa05f93 Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Tue, 23 Jan 2018 08:53:01 -0800
Subject: [PATCH 2/7] restore convert_sql_column tests

---
 pandas/tests/dtypes/test_io.py | 73 ++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 pandas/tests/dtypes/test_io.py

diff --git a/pandas/tests/dtypes/test_io.py b/pandas/tests/dtypes/test_io.py
new file mode 100644
index 0000000000000..06b61371c9a0b
--- /dev/null
+++ b/pandas/tests/dtypes/test_io.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pandas._libs.lib as lib
+import pandas.util.testing as tm
+
+from pandas.compat import long, u
+
+
+class TestParseSQL(object):
+
+    def test_convert_sql_column_floats(self):
+        arr = np.array([1.5, None, 3, 4.2], dtype=object)
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_strings(self):
+        arr = np.array(['1.5', None, '3', '4.2'], dtype=object)
+        result = lib.convert_sql_column(arr)
+        expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_unicode(self):
+        arr = np.array([u('1.5'), None, u('3'), u('4.2')],
+                       dtype=object)
+        result = lib.convert_sql_column(arr)
+        expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')],
+                            dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_ints(self):
+        arr = np.array([1, 2, 3, 4], dtype='O')
+        arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O')
+        result = lib.convert_sql_column(arr)
+        result2 = lib.convert_sql_column(arr2)
+        expected = np.array([1, 2, 3, 4], dtype='i8')
+        tm.assert_numpy_array_equal(result, expected)
+        tm.assert_numpy_array_equal(result2, expected)
+
+        arr = np.array([1, 2, 3, None, 4], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1, 2, 3, np.nan, 4], dtype='f8')
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_longs(self):
+        arr = np.array([long(1), long(2), long(3), long(4)], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1, 2, 3, 4], dtype='i8')
+        tm.assert_numpy_array_equal(result, expected)
+
+        arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1, 2, 3, np.nan, 4], dtype='f8')
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_bools(self):
+        arr = np.array([True, False, True, False], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([True, False, True, False], dtype=bool)
+        tm.assert_numpy_array_equal(result, expected)
+
+        arr = np.array([True, False, None, False], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([True, False, np.nan, False], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_decimals(self):
+        from decimal import Decimal
+        arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')])
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
+        tm.assert_numpy_array_equal(result, expected)
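
A rough sketch of the behavior the tests above pin down (illustrative
values only; the next patch restores convert_sql_column itself as a thin
wrapper around maybe_convert_objects(x, try_float=1)):

    import numpy as np
    from pandas._libs.lib import convert_sql_column

    # with try_float=1, numeric object columns (None standing in for
    # SQL NULL) come back as float64
    arr = np.array([1.5, None, 3, 4.2], dtype=object)
    convert_sql_column(arr)  # -> array([1.5, nan, 3. , 4.2])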
From 54c52b99dccc6a054749ad938f7a5a5dddef4c9e Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Tue, 23 Jan 2018 08:55:11 -0800
Subject: [PATCH 3/7] restore convert_sql_column

---
 pandas/_libs/src/inference.pyx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx
index 17ff062c15497..9518e1a131ac9 100644
--- a/pandas/_libs/src/inference.pyx
+++ b/pandas/_libs/src/inference.pyx
@@ -1390,6 +1390,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     return objects


+def convert_sql_column(x):
+    return maybe_convert_objects(x, try_float=1)
+
+
 def maybe_convert_bool(ndarray[object] arr,
                        true_values=None, false_values=None):
     cdef:

From 855e029634f540a698d2a4e5b95e182e5a033637 Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Tue, 23 Jan 2018 08:57:25 -0800
Subject: [PATCH 4/7] revert edits that overlap with #19360

---
 pandas/_libs/lib.pyx | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index e31bc950fc0d5..e58cb367810bd 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -17,22 +17,29 @@ from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM,
 np.import_array()
 np.import_ufunc()

+from libc.stdlib cimport malloc, free
+
 from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
                       PyList_Check, PyFloat_Check, PyBool_Check,
                       PyString_Check,
                       PyBytes_Check,
                       PyUnicode_Check,
                       PyTuple_New,
-                      PyObject_RichCompareBool,
-                      PyObject)
+                      PyObject_RichCompareBool)

 cimport cpython

+isnan = np.isnan
+cdef double NaN = np.NaN
+cdef double nan = NaN
+
 from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
                                PyTime_Check, PyDelta_Check,
                                PyDateTime_IMPORT)
 PyDateTime_IMPORT

+from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value
+
 from tslib import NaT, Timestamp, Timedelta, array_to_datetime
 from interval import Interval
 from missing cimport checknull

From dfccc0788489c080f46edfdc48819ada5754bb24 Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Tue, 23 Jan 2018 08:59:09 -0800
Subject: [PATCH 5/7] revert edits that overlap with #19360

---
 pandas/_libs/src/inference.pyx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx
index 9518e1a131ac9..0015a58d9de34 100644
--- a/pandas/_libs/src/inference.pyx
+++ b/pandas/_libs/src/inference.pyx
@@ -11,9 +11,9 @@ iNaT = util.get_nat()

 cdef bint PY2 = sys.version_info[0] == 2

-from util cimport UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN
-
-cdef double nan = np.NaN
+from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX,
+                   INT8_MIN, INT8_MAX, INT16_MIN, INT16_MAX,
+                   INT32_MAX, INT32_MIN, INT64_MAX, INT64_MIN)

 # core.common import for fast inference checks

@@ -737,7 +737,7 @@ cdef class IntegerFloatValidator(Validator):
         return 
issubclass(self.dtype.type, np.integer) -cdef bint is_integer_float_array(ndarray values): +cpdef bint is_integer_float_array(ndarray values): cdef: IntegerFloatValidator validator = IntegerFloatValidator( len(values), @@ -788,7 +788,7 @@ cdef class UnicodeValidator(Validator): return issubclass(self.dtype.type, np.unicode_) -cdef bint is_unicode_array(ndarray values, bint skipna=False): +cpdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: UnicodeValidator validator = UnicodeValidator( len(values), @@ -807,7 +807,7 @@ cdef class BytesValidator(Validator): return issubclass(self.dtype.type, np.bytes_) -cdef bint is_bytes_array(ndarray values, bint skipna=False): +cpdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: BytesValidator validator = BytesValidator( len(values), From 1a8132d46eb11749ae4ccd8791ab99b1156aa0ea Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 23 Jan 2018 09:06:14 -0800 Subject: [PATCH 6/7] restore whitespace --- pandas/_libs/lib.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e58cb367810bd..c634ebfb68b7a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -44,6 +44,7 @@ from tslib import NaT, Timestamp, Timedelta, array_to_datetime from interval import Interval from missing cimport checknull + cimport util cdef int64_t NPY_NAT = util.get_nat() from util cimport is_array, _checknull From 3b273f06b36ee6deb939fc6eebccde929be637f7 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 23 Jan 2018 10:03:09 -0800 Subject: [PATCH 7/7] fix import --- pandas/io/pytables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6efbb3f19d1c3..0558b4d340a17 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4561,7 +4561,7 @@ def _convert_string_array(data, encoding, itemsize=None): # create the sized dtype if itemsize is None: - itemsize = lib.max_len_string_array(_ensure_object(data.ravel())) + itemsize = libio.max_len_string_array(_ensure_object(data.ravel())) data = np.asarray(data, dtype="S%d" % itemsize) return data @@ -4590,7 +4590,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): encoding = _ensure_encoding(encoding) if encoding is not None and len(data): - itemsize = lib.max_len_string_array(_ensure_object(data)) + itemsize = libio.max_len_string_array(_ensure_object(data)) if compat.PY3: dtype = "U{0}".format(itemsize) else:
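
A minimal usage sketch of the relocated helpers (illustrative only: the
sample arrays are made up, and the extensions must be rebuilt so that the
new pandas._libs.io_helper module exists):

    import numpy as np
    from pandas._libs import io_helper as libio

    # longest element of an object-dtype string array (moved from lib.pyx)
    arr = np.array(['foo', 'b', 'pandas'], dtype=object)
    assert libio.max_len_string_array(arr) == 6

    # in-place NA substitution (moved from inference.pyx); the return
    # value is the number of entries replaced with NaN
    vals = np.array(['a', 'NA', ''], dtype=object)
    assert libio.sanitize_objects(vals, {'NA'}, convert_empty=True) == 2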