From d0263c996739588a561b0f80ce3f40cad6a26860 Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Mon, 22 Jan 2018 16:09:20 -0800
Subject: [PATCH 1/7] implement io_helper

---
 pandas/_libs/io_helper.pyx     | 233 +++++++++++++++++++++++++++++++++
 pandas/_libs/lib.pyx           | 203 +---------------------------
 pandas/_libs/src/inference.pyx |  44 ++-----
 pandas/io/formats/format.py    |   5 +-
 pandas/io/json/normalize.py    |   2 +-
 pandas/io/parsers.py           |   5 +-
 pandas/io/pytables.py          |   8 +-
 pandas/io/stata.py             |   3 +-
 pandas/tests/dtypes/test_io.py |  73 -----------
 pandas/tests/test_lib.py       |  10 +-
 setup.py                       |   4 +
 11 files changed, 264 insertions(+), 326 deletions(-)
 create mode 100644 pandas/_libs/io_helper.pyx
 delete mode 100644 pandas/tests/dtypes/test_io.py

diff --git a/pandas/_libs/io_helper.pyx b/pandas/_libs/io_helper.pyx
new file mode 100644
index 0000000000000..aa9af96c1bd6c
--- /dev/null
+++ b/pandas/_libs/io_helper.pyx
@@ -0,0 +1,233 @@
+# -*- coding: utf-8 -*-
+
+cimport cython
+from cython cimport Py_ssize_t
+
+from cpython cimport (PyString_Check, PyBytes_Check, PyUnicode_Check,
+                      PyBytes_GET_SIZE, PyUnicode_GET_SIZE)
+
+try:
+    from cpython cimport PyString_GET_SIZE
+except ImportError:
+    from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport ndarray, uint8_t
+cnp.import_array()
+
+cimport util
+
+
+ctypedef fused pandas_string:
+    str
+    unicode
+    bytes
+
+
+def sanitize_objects(ndarray[object] values, set na_values,
+                     convert_empty=True):
+    cdef:
+        Py_ssize_t i, n
+        object val, onan
+        Py_ssize_t na_count = 0
+        dict memo = {}
+
+    n = len(values)
+    onan = np.nan
+
+    for i from 0 <= i < n:
+        val = values[i]
+        if (convert_empty and val == '') or (val in na_values):
+            values[i] = onan
+            na_count += 1
+        elif val in memo:
+            values[i] = memo[val]
+        else:
+            memo[val] = val
+
+    return na_count
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def write_csv_rows(list data, ndarray data_index,
+                   int nlevels, ndarray cols, object writer):
+
+    cdef int N, j, i, ncols
+    cdef list rows
+    cdef object val
+
+    # In crude testing, N>100 yields little marginal improvement
+    N = 100
+
+    # pre-allocate rows
+    ncols = len(cols)
+    rows = [[None] * (nlevels + ncols) for x in range(N)]
+
+    j = -1
+    if nlevels == 1:
+        for j in range(len(data_index)):
+            row = rows[j % N]
+            row[0] = data_index[j]
+            for i in range(ncols):
+                row[1 + i] = data[i][j]
+
+            if j >= N - 1 and j % N == N - 1:
+                writer.writerows(rows)
+    elif nlevels > 1:
+        for j in range(len(data_index)):
+            row = rows[j % N]
+            row[:nlevels] = list(data_index[j])
+            for i in range(ncols):
+                row[nlevels + i] = data[i][j]
+
+            if j >= N - 1 and j % N == N - 1:
+                writer.writerows(rows)
+    else:
+        for j in range(len(data_index)):
+            row = rows[j % N]
+            for i in range(ncols):
+                row[i] = data[i][j]
+
+            if j >= N - 1 and j % N == N - 1:
+                writer.writerows(rows)
+
+    if j >= 0 and (j < N - 1 or (j % N) != N - 1):
+        writer.writerows(rows[:((j + 1) % N)])
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def convert_json_to_lines(object arr):
+    """
+    replace comma separated json with line feeds, paying special attention
+    to quotes & brackets
+    """
+    cdef:
+        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
+        bint in_quotes = 0, is_escaping = 0
+        ndarray[uint8_t] narr
+        unsigned char v, comma, left_bracket, right_bracket, newline
+        unsigned char quote, backslash
+
+    newline = ord('\n')
+    comma = ord(',')
+    left_bracket = ord('{')
+    right_bracket = ord('}')
+    quote = ord('"')
+    backslash = ord('\\')
+
+    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
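+    # scan byte-by-byte, tracking backslash escapes and whether we are
+    # inside a quoted string; commas at bracket depth zero and outside
+    # quotes are rewritten to newlines in place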
+    length = narr.shape[0]
+    for i in range(length):
+        v = narr[i]
+        if v == quote and i > 0 and not is_escaping:
+            in_quotes = ~in_quotes
+        if v == backslash or is_escaping:
+            is_escaping = ~is_escaping
+        if v == comma:  # commas that should be \n
+            if num_open_brackets_seen == 0 and not in_quotes:
+                narr[i] = newline
+        elif v == left_bracket:
+            if not in_quotes:
+                num_open_brackets_seen += 1
+        elif v == right_bracket:
+            if not in_quotes:
+                num_open_brackets_seen -= 1
+
+    return narr.tostring().decode('utf-8')
+
+
+# stata, pytables
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr):
+    """ return the maximum size of elements in a 1-dim string array """
+    cdef:
+        Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
+        pandas_string v
+
+    for i in range(length):
+        v = arr[i]
+        if PyString_Check(v):
+            l = PyString_GET_SIZE(v)
+        elif PyBytes_Check(v):
+            l = PyBytes_GET_SIZE(v)
+        elif PyUnicode_Check(v):
+            l = PyUnicode_GET_SIZE(v)
+
+        if l > m:
+            m = l
+
+    return m
+
+
+# ------------------------------------------------------------------
+# PyTables Helpers
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def string_array_replace_from_nan_rep(
+        ndarray[object, ndim=1] arr, object nan_rep,
+        object replace=None):
+    """
+    Replace the values in the array with 'replace' if
+    they are 'nan_rep'. Return the same array.
+    """
+
+    cdef int length = arr.shape[0], i = 0
+    if replace is None:
+        replace = np.nan
+
+    for i from 0 <= i < length:
+        if arr[i] == nan_rep:
+            arr[i] = replace
+
+    return arr
+
+
+def convert_timestamps(ndarray values):
+    cdef:
+        object val, f, result
+        dict cache = {}
+        Py_ssize_t i, n = len(values)
+        ndarray[object] out
+
+    # for HDFStore, a bit temporary but...
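+    # each distinct raw value goes through datetime.fromtimestamp once
+    # and is then reused from `cache`, which helps when an HDFStore
+    # column holds repeated timestamps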
+ + from datetime import datetime + f = datetime.fromtimestamp + + out = np.empty(n, dtype='O') + + for i in range(n): + val = util.get_value_1d(values, i) + if val in cache: + out[i] = cache[val] + else: + cache[val] = out[i] = f(val) + + return out + + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < n: + val = values[i] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1632f5d016439..e31bc950fc0d5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -17,42 +17,26 @@ from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM, np.import_array() np.import_ufunc() -from libc.stdlib cimport malloc, free - from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, - PyList_Check, PyFloat_Check, + PyList_Check, PyFloat_Check, PyBool_Check, PyString_Check, PyBytes_Check, PyUnicode_Check, PyTuple_New, PyObject_RichCompareBool, - PyBytes_GET_SIZE, - PyUnicode_GET_SIZE, PyObject) -try: - from cpython cimport PyString_GET_SIZE -except ImportError: - from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE - cimport cpython -isnan = np.isnan -cdef double NaN = np.NaN -cdef double nan = NaN - from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyTime_Check, PyDelta_Check, PyDateTime_IMPORT) PyDateTime_IMPORT -from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value - from tslib import NaT, Timestamp, Timedelta, array_to_datetime from interval import Interval from missing cimport checknull - cimport util cdef int64_t NPY_NAT = util.get_nat() from util cimport is_array, _checknull @@ -138,28 +122,6 @@ def item_from_zerodim(object val): return util.unbox_if_zerodim(val) -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique(ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - list uniques = [] - dict table = {} - object val, stub = 0 - - for i from 0 <= i < n: - val = values[i] - if val not in table: - table[val] = stub - uniques.append(val) - try: - uniques.sort() - except Exception: - pass - - return uniques - - @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple(list arrays): @@ -379,30 +341,6 @@ def has_infs_f8(ndarray[float64_t] arr): return False -def convert_timestamps(ndarray values): - cdef: - object val, f, result - dict cache = {} - Py_ssize_t i, n = len(values) - ndarray[object] out - - # for HDFStore, a bit temporary but... 
- - from datetime import datetime - f = datetime.fromtimestamp - - out = np.empty(n, dtype='O') - - for i in range(n): - val = util.get_value_1d(values, i) - if val in cache: - out[i] = cache[val] - else: - cache[val] = out[i] = f(val) - - return out - - def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len): cdef: Py_ssize_t i, n = len(indices) @@ -742,145 +680,6 @@ def clean_index_list(list obj): return np.asarray(obj), 0 -ctypedef fused pandas_string: - str - unicode - bytes - - -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr): - """ return the maximum size of elements in a 1-dim string array """ - cdef: - Py_ssize_t i, m = 0, l = 0, length = arr.shape[0] - pandas_string v - - for i in range(length): - v = arr[i] - if PyString_Check(v): - l = PyString_GET_SIZE(v) - elif PyBytes_Check(v): - l = PyBytes_GET_SIZE(v) - elif PyUnicode_Check(v): - l = PyUnicode_GET_SIZE(v) - - if l > m: - m = l - - return m - - -@cython.boundscheck(False) -@cython.wraparound(False) -def string_array_replace_from_nan_rep( - ndarray[object, ndim=1] arr, object nan_rep, - object replace=None): - """ - Replace the values in the array with 'replacement' if - they are 'nan_rep'. Return the same array. - """ - - cdef int length = arr.shape[0], i = 0 - if replace is None: - replace = np.nan - - for i from 0 <= i < length: - if arr[i] == nan_rep: - arr[i] = replace - - return arr - - -@cython.boundscheck(False) -@cython.wraparound(False) -def convert_json_to_lines(object arr): - """ - replace comma separated json with line feeds, paying special attention - to quotes & brackets - """ - cdef: - Py_ssize_t i = 0, num_open_brackets_seen = 0, length - bint in_quotes = 0, is_escaping = 0 - ndarray[uint8_t] narr - unsigned char v, comma, left_bracket, right_brack, newline - - newline = ord('\n') - comma = ord(',') - left_bracket = ord('{') - right_bracket = ord('}') - quote = ord('"') - backslash = ord('\\') - - narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy() - length = narr.shape[0] - for i in range(length): - v = narr[i] - if v == quote and i > 0 and not is_escaping: - in_quotes = ~in_quotes - if v == backslash or is_escaping: - is_escaping = ~is_escaping - if v == comma: # commas that should be \n - if num_open_brackets_seen == 0 and not in_quotes: - narr[i] = newline - elif v == left_bracket: - if not in_quotes: - num_open_brackets_seen += 1 - elif v == right_bracket: - if not in_quotes: - num_open_brackets_seen -= 1 - - return narr.tostring().decode('utf-8') - - -@cython.boundscheck(False) -@cython.wraparound(False) -def write_csv_rows(list data, ndarray data_index, - int nlevels, ndarray cols, object writer): - - cdef int N, j, i, ncols - cdef list rows - cdef object val - - # In crude testing, N>100 yields little marginal improvement - N=100 - - # pre-allocate rows - ncols = len(cols) - rows = [[None] * (nlevels + ncols) for x in range(N)] - - j = -1 - if nlevels == 1: - for j in range(len(data_index)): - row = rows[j % N] - row[0] = data_index[j] - for i in range(ncols): - row[1 + i] = data[i][j] - - if j >= N - 1 and j % N == N - 1: - writer.writerows(rows) - elif nlevels > 1: - for j in range(len(data_index)): - row = rows[j % N] - row[:nlevels] = list(data_index[j]) - for i in range(ncols): - row[nlevels + i] = data[i][j] - - if j >= N - 1 and j % N == N - 1: - writer.writerows(rows) - else: - for j in range(len(data_index)): - row = rows[j % N] - for i in range(ncols): - row[i] = data[i][j] - - if j >= N - 1 and j % N == N 
- 1: - writer.writerows(rows) - - if j >= 0 and (j < N - 1 or (j % N) != N - 1): - writer.writerows(rows[:((j + 1) % N)]) - - # ------------------------------------------------------------------------------ # Groupby-related functions diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index e15f276b39bf8..17ff062c15497 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -6,14 +6,14 @@ from tslibs.nattype import NaT from tslibs.conversion cimport convert_to_tsobject from tslibs.timedeltas cimport convert_to_timedelta64 from tslibs.timezones cimport get_timezone, tz_compare -from datetime import datetime, timedelta + iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 -from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX, - INT8_MIN, INT8_MAX, INT16_MIN, INT16_MAX, - INT32_MAX, INT32_MIN, INT64_MAX, INT64_MIN) +from util cimport UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN + +cdef double nan = np.NaN # core.common import for fast inference checks @@ -737,7 +737,7 @@ cdef class IntegerFloatValidator(Validator): return issubclass(self.dtype.type, np.integer) -cpdef bint is_integer_float_array(ndarray values): +cdef bint is_integer_float_array(ndarray values): cdef: IntegerFloatValidator validator = IntegerFloatValidator( len(values), @@ -788,7 +788,7 @@ cdef class UnicodeValidator(Validator): return issubclass(self.dtype.type, np.unicode_) -cpdef bint is_unicode_array(ndarray values, bint skipna=False): +cdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: UnicodeValidator validator = UnicodeValidator( len(values), @@ -807,7 +807,7 @@ cdef class BytesValidator(Validator): return issubclass(self.dtype.type, np.bytes_) -cpdef bint is_bytes_array(ndarray values, bint skipna=False): +cdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: BytesValidator validator = BytesValidator( len(values), @@ -1390,34 +1390,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects -def convert_sql_column(x): - return maybe_convert_objects(x, try_float=1) - - -def sanitize_objects(ndarray[object] values, set na_values, - convert_empty=True): - cdef: - Py_ssize_t i, n - object val, onan - Py_ssize_t na_count = 0 - dict memo = {} - - n = len(values) - onan = np.nan - - for i from 0 <= i < n: - val = values[i] - if (convert_empty and val == '') or (val in na_values): - values[i] = onan - na_count += 1 - elif val in memo: - values[i] = memo[val] - else: - memo[val] = val - - return na_count - - def maybe_convert_bool(ndarray[object] arr, true_values=None, false_values=None): cdef: @@ -1443,7 +1415,7 @@ def maybe_convert_bool(ndarray[object] arr, for i from 0 <= i < n: val = arr[i] - if cpython.PyBool_Check(val): + if PyBool_Check(val): if val is True: result[i] = 1 else: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2293032ebb8a1..04d0a048bfdd3 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -38,7 +38,7 @@ _stringify_path) from pandas.io.formats.printing import adjoin, justify, pprint_thing from pandas.io.formats.common import get_level_lengths -from pandas._libs import lib +from pandas._libs import lib, io_helper as libio from pandas._libs.tslib import (iNaT, Timestamp, Timedelta, format_array_from_datetime) from pandas.core.indexes.datetimes import DatetimeIndex @@ -1789,7 +1789,8 @@ def _save_chunk(self, start_i, end_i): date_format=self.date_format, quoting=self.quoting) - lib.write_csv_rows(self.data, ix, 
self.nlevels, self.cols, self.writer) + libio.write_csv_rows(self.data, ix, self.nlevels, + self.cols, self.writer) # ---------------------------------------------------------------------- diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 595031b04e367..fa03e7bb5caa2 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -5,7 +5,7 @@ from collections import defaultdict import numpy as np -from pandas._libs.lib import convert_json_to_lines +from pandas._libs.io_helper import convert_json_to_lines from pandas import compat, DataFrame diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5135bb01fb378..545b40f2cc96b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -43,6 +43,7 @@ import pandas._libs.lib as lib import pandas._libs.parsers as parsers +from pandas._libs import io_helper as libio from pandas._libs.tslibs import parsing # BOM character (byte order mark) @@ -1596,11 +1597,11 @@ def _infer_types(self, values, na_values, try_num_bool=True): except Exception: result = values if values.dtype == np.object_: - na_count = lib.sanitize_objects(result, na_values, False) + na_count = libio.sanitize_objects(result, na_values, False) else: result = values if values.dtype == np.object_: - na_count = lib.sanitize_objects(values, na_values, False) + na_count = libio.sanitize_objects(values, na_values, False) if result.dtype == np.object_ and try_num_bool: result = lib.maybe_convert_bool(values, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 106823199ee93..6efbb3f19d1c3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -47,7 +47,7 @@ from pandas.core.config import get_option from pandas.core.computation.pytables import Expr, maybe_expression -from pandas._libs import algos, lib +from pandas._libs import algos, lib, io_helper as libio from pandas._libs.tslibs import timezones from distutils.version import LooseVersion @@ -3843,7 +3843,7 @@ def read(self, where=None, columns=None, **kwargs): # need a better algorithm tuple_index = long_index.values - unique_tuples = lib.fast_unique(tuple_index) + unique_tuples = libio.fast_unique(tuple_index) unique_tuples = com._asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) @@ -4604,7 +4604,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): if nan_rep is None: nan_rep = 'nan' - data = lib.string_array_replace_from_nan_rep(data, nan_rep) + data = libio.string_array_replace_from_nan_rep(data, nan_rep) return data.reshape(shape) @@ -4621,7 +4621,7 @@ def _get_converter(kind, encoding): if kind == 'datetime64': return lambda x: np.asarray(x, dtype='M8[ns]') elif kind == 'datetime': - return lib.convert_timestamps + return libio.convert_timestamps elif kind == 'string': return lambda x: _unconvert_string_array(x, encoding=encoding) else: # pragma: no cover diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b409cf20e9a09..60af6242ee56d 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -16,7 +16,8 @@ import numpy as np from dateutil.relativedelta import relativedelta -from pandas._libs.lib import max_len_string_array, infer_dtype +from pandas._libs.lib import infer_dtype +from pandas._libs.io_helper import max_len_string_array from pandas._libs.tslib import NaT, Timestamp import pandas as pd diff --git a/pandas/tests/dtypes/test_io.py b/pandas/tests/dtypes/test_io.py deleted file mode 100644 index 06b61371c9a0b..0000000000000 --- a/pandas/tests/dtypes/test_io.py +++ /dev/null @@ -1,73 +0,0 @@ -# -*- 
coding: utf-8 -*- - -import numpy as np -import pandas._libs.lib as lib -import pandas.util.testing as tm - -from pandas.compat import long, u - - -class TestParseSQL(object): - - def test_convert_sql_column_floats(self): - arr = np.array([1.5, None, 3, 4.2], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_strings(self): - arr = np.array(['1.5', None, '3', '4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_unicode(self): - arr = np.array([u('1.5'), None, u('3'), u('4.2')], - dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], - dtype=object) - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_ints(self): - arr = np.array([1, 2, 3, 4], dtype='O') - arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') - result = lib.convert_sql_column(arr) - result2 = lib.convert_sql_column(arr2) - expected = np.array([1, 2, 3, 4], dtype='i8') - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result2, expected) - - arr = np.array([1, 2, 3, None, 4], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_longs(self): - arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, 4], dtype='i8') - tm.assert_numpy_array_equal(result, expected) - - arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_bools(self): - arr = np.array([True, False, True, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, True, False], dtype=bool) - tm.assert_numpy_array_equal(result, expected) - - arr = np.array([True, False, None, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, np.nan, False], dtype=object) - tm.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_decimals(self): - from decimal import Decimal - arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 10061204df42a..66e9886a3d998 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -3,7 +3,7 @@ import pytest import numpy as np -from pandas._libs import lib +from pandas._libs import lib, io_helper as libio import pandas.util.testing as tm @@ -12,19 +12,19 @@ class TestMisc(object): def test_max_len_string_array(self): arr = a = np.array(['foo', 'b', np.nan], dtype='object') - assert lib.max_len_string_array(arr) == 3 + assert libio.max_len_string_array(arr) == 3 # unicode arr = a.astype('U').astype(object) - assert lib.max_len_string_array(arr) == 3 + assert libio.max_len_string_array(arr) == 3 # bytes for python3 arr = a.astype('S').astype(object) - assert lib.max_len_string_array(arr) == 3 + assert 
libio.max_len_string_array(arr) == 3

         # raises
         pytest.raises(TypeError,
-                      lambda: lib.max_len_string_array(arr.astype('U')))
+                      lambda: libio.max_len_string_array(arr.astype('U')))

     def test_fast_unique_multiple_list_gen_sort(self):
         keys = [['p', 'a'], ['n', 'd'], ['a', 's']]
diff --git a/setup.py b/setup.py
index 7ade1544ec5cd..c889bac898e33 100755
--- a/setup.py
+++ b/setup.py
@@ -307,6 +307,7 @@ class CheckSDist(sdist_class):
                  'pandas/_libs/join.pyx',
                  'pandas/_libs/indexing.pyx',
                  'pandas/_libs/interval.pyx',
+                 'pandas/_libs/io_helper.pyx',
                  'pandas/_libs/hashing.pyx',
                  'pandas/_libs/missing.pyx',
                  'pandas/_libs/reduction.pyx',
@@ -486,6 +487,9 @@ def pxd(name):
         'pyxfile': '_libs/interval',
         'pxdfiles': ['_libs/hashtable'],
         'depends': _pxi_dep['interval']},
+    '_libs.io_helper': {
+        'pyxfile': '_libs/io_helper',
+        'pxdfiles': ['_libs/src/util']},
     '_libs.join': {
         'pyxfile': '_libs/join',
         'pxdfiles': ['_libs/src/util', '_libs/hashtable'],

From 27c7ffb0b7f8c028e2f2f0efdad962889fa05f93 Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Tue, 23 Jan 2018 08:53:01 -0800
Subject: [PATCH 2/7] restore convert_sql_column tests

---
 pandas/tests/dtypes/test_io.py | 73 ++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 pandas/tests/dtypes/test_io.py

diff --git a/pandas/tests/dtypes/test_io.py b/pandas/tests/dtypes/test_io.py
new file mode 100644
index 0000000000000..06b61371c9a0b
--- /dev/null
+++ b/pandas/tests/dtypes/test_io.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pandas._libs.lib as lib
+import pandas.util.testing as tm
+
+from pandas.compat import long, u
+
+
+class TestParseSQL(object):
+
+    def test_convert_sql_column_floats(self):
+        arr = np.array([1.5, None, 3, 4.2], dtype=object)
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_strings(self):
+        arr = np.array(['1.5', None, '3', '4.2'], dtype=object)
+        result = lib.convert_sql_column(arr)
+        expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_unicode(self):
+        arr = np.array([u('1.5'), None, u('3'), u('4.2')],
+                       dtype=object)
+        result = lib.convert_sql_column(arr)
+        expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')],
+                            dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_ints(self):
+        arr = np.array([1, 2, 3, 4], dtype='O')
+        arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O')
+        result = lib.convert_sql_column(arr)
+        result2 = lib.convert_sql_column(arr2)
+        expected = np.array([1, 2, 3, 4], dtype='i8')
+        tm.assert_numpy_array_equal(result, expected)
+        tm.assert_numpy_array_equal(result2, expected)
+
+        arr = np.array([1, 2, 3, None, 4], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1, 2, 3, np.nan, 4], dtype='f8')
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_longs(self):
+        arr = np.array([long(1), long(2), long(3), long(4)], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1, 2, 3, 4], dtype='i8')
+        tm.assert_numpy_array_equal(result, expected)
+
+        arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1, 2, 3, np.nan, 4], dtype='f8')
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_bools(self):
+        arr = np.array([True, False, True, False], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([True, False, True, False], dtype=bool)
+        tm.assert_numpy_array_equal(result, expected)
+
+        arr = np.array([True, False, None, False], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([True, False, np.nan, False], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_decimals(self):
+        from decimal import Decimal
+        arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')])
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
+        tm.assert_numpy_array_equal(result, expected)
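
A rough sketch of the behavior the tests above pin down (illustrative
values only; the next patch restores convert_sql_column itself as a thin
wrapper around maybe_convert_objects(x, try_float=1)):

    import numpy as np
    from pandas._libs.lib import convert_sql_column

    # with try_float=1, numeric object columns (None standing in for
    # SQL NULL) come back as float64
    arr = np.array([1.5, None, 3, 4.2], dtype=object)
    convert_sql_column(arr)  # -> array([1.5, nan, 3. , 4.2])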
From 54c52b99dccc6a054749ad938f7a5a5dddef4c9e Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Tue, 23 Jan 2018 08:55:11 -0800
Subject: [PATCH 3/7] restore convert_sql_column

---
 pandas/_libs/src/inference.pyx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx
index 17ff062c15497..9518e1a131ac9 100644
--- a/pandas/_libs/src/inference.pyx
+++ b/pandas/_libs/src/inference.pyx
@@ -1390,6 +1390,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     return objects


+def convert_sql_column(x):
+    return maybe_convert_objects(x, try_float=1)
+
+
 def maybe_convert_bool(ndarray[object] arr,
                        true_values=None, false_values=None):
     cdef:

From 855e029634f540a698d2a4e5b95e182e5a033637 Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Tue, 23 Jan 2018 08:57:25 -0800
Subject: [PATCH 4/7] revert edits that overlap with #19360

---
 pandas/_libs/lib.pyx | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index e31bc950fc0d5..e58cb367810bd 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -17,22 +17,29 @@ from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM,
 np.import_array()
 np.import_ufunc()

+from libc.stdlib cimport malloc, free
+
 from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
                       PyList_Check, PyFloat_Check, PyBool_Check,
                       PyString_Check,
                       PyBytes_Check,
                       PyUnicode_Check,
                       PyTuple_New,
-                      PyObject_RichCompareBool,
-                      PyObject)
+                      PyObject_RichCompareBool)

 cimport cpython

+isnan = np.isnan
+cdef double NaN = np.NaN
+cdef double nan = NaN
+
 from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
                                PyTime_Check, PyDelta_Check,
                                PyDateTime_IMPORT)
 PyDateTime_IMPORT

+from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value
+
 from tslib import NaT, Timestamp, Timedelta, array_to_datetime
 from interval import Interval
 from missing cimport checknull

From dfccc0788489c080f46edfdc48819ada5754bb24 Mon Sep 17 00:00:00 2001
From: Brock Mendel
Date: Tue, 23 Jan 2018 08:59:09 -0800
Subject: [PATCH 5/7] revert edits that overlap with #19360

---
 pandas/_libs/src/inference.pyx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx
index 9518e1a131ac9..0015a58d9de34 100644
--- a/pandas/_libs/src/inference.pyx
+++ b/pandas/_libs/src/inference.pyx
@@ -11,9 +11,9 @@ iNaT = util.get_nat()

 cdef bint PY2 = sys.version_info[0] == 2

-from util cimport UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN
-
-cdef double nan = np.NaN
+from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX,
+                   INT8_MIN, INT8_MAX, INT16_MIN, INT16_MAX,
+                   INT32_MAX, INT32_MIN, INT64_MAX, INT64_MIN)

 # core.common import for fast inference checks

@@ -737,7 +737,7 @@ cdef class IntegerFloatValidator(Validator):
         return 
issubclass(self.dtype.type, np.integer) -cdef bint is_integer_float_array(ndarray values): +cpdef bint is_integer_float_array(ndarray values): cdef: IntegerFloatValidator validator = IntegerFloatValidator( len(values), @@ -788,7 +788,7 @@ cdef class UnicodeValidator(Validator): return issubclass(self.dtype.type, np.unicode_) -cdef bint is_unicode_array(ndarray values, bint skipna=False): +cpdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: UnicodeValidator validator = UnicodeValidator( len(values), @@ -807,7 +807,7 @@ cdef class BytesValidator(Validator): return issubclass(self.dtype.type, np.bytes_) -cdef bint is_bytes_array(ndarray values, bint skipna=False): +cpdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: BytesValidator validator = BytesValidator( len(values), From 1a8132d46eb11749ae4ccd8791ab99b1156aa0ea Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 23 Jan 2018 09:06:14 -0800 Subject: [PATCH 6/7] restore whitespace --- pandas/_libs/lib.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e58cb367810bd..c634ebfb68b7a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -44,6 +44,7 @@ from tslib import NaT, Timestamp, Timedelta, array_to_datetime from interval import Interval from missing cimport checknull + cimport util cdef int64_t NPY_NAT = util.get_nat() from util cimport is_array, _checknull From 3b273f06b36ee6deb939fc6eebccde929be637f7 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 23 Jan 2018 10:03:09 -0800 Subject: [PATCH 7/7] fix import --- pandas/io/pytables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6efbb3f19d1c3..0558b4d340a17 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4561,7 +4561,7 @@ def _convert_string_array(data, encoding, itemsize=None): # create the sized dtype if itemsize is None: - itemsize = lib.max_len_string_array(_ensure_object(data.ravel())) + itemsize = libio.max_len_string_array(_ensure_object(data.ravel())) data = np.asarray(data, dtype="S%d" % itemsize) return data @@ -4590,7 +4590,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): encoding = _ensure_encoding(encoding) if encoding is not None and len(data): - itemsize = lib.max_len_string_array(_ensure_object(data)) + itemsize = libio.max_len_string_array(_ensure_object(data)) if compat.PY3: dtype = "U{0}".format(itemsize) else:
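
A minimal usage sketch of the relocated helpers (illustrative only: the
sample arrays are made up, and the extensions must be rebuilt so that the
new pandas._libs.io_helper module exists):

    import numpy as np
    from pandas._libs import io_helper as libio

    # longest element of an object-dtype string array (moved from lib.pyx)
    arr = np.array(['foo', 'b', 'pandas'], dtype=object)
    assert libio.max_len_string_array(arr) == 6

    # in-place NA substitution (moved from inference.pyx); the return
    # value is the number of entries replaced with NaN
    vals = np.array(['a', 'NA', ''], dtype=object)
    assert libio.sanitize_objects(vals, {'NA'}, convert_empty=True) == 2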