From fcad4d079d4aa459fd2dd404935d5f245d8004ff Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 1 Jul 2018 00:03:22 -0700 Subject: [PATCH 1/4] CLN: to_datetime internals --- pandas/core/tools/datetimes.py | 410 +++++++++++++++++++-------------- 1 file changed, 232 insertions(+), 178 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 18802d98a347e..baf5a7d47974e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1,3 +1,4 @@ +import partial from datetime import datetime, timedelta, time from collections import MutableMapping @@ -38,7 +39,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) -def _maybe_cache(arg, format, cache, tz, convert_listlike): +def _maybe_cache(arg, format, cache, convert_listlike): """ Create a cache of unique dates from an array of dates @@ -49,8 +50,6 @@ def _maybe_cache(arg, format, cache, tz, convert_listlike): Strftime format to parse time cache : boolean True attempts to create a cache of converted values - tz : string - Timezone of the dates convert_listlike : function Conversion function to apply on dates @@ -66,7 +65,7 @@ def _maybe_cache(arg, format, cache, tz, convert_listlike): from pandas import Index if not Index(arg).is_unique: unique_dates = algorithms.unique(arg) - cache_dates = convert_listlike(unique_dates, True, format, tz=tz) + cache_dates = convert_listlike(unique_dates, True, format) cache_array = Series(cache_dates, index=unique_dates) return cache_array @@ -140,6 +139,222 @@ def _return_parsed_timezone_results(result, timezones, box, tz): return tz_results +def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, + unit=None, errors=None, + infer_datetime_format=None, dayfirst=None, + yearfirst=None): + """ + Helper function for to_datetime. Performs the conversions of 1D listlike + of dates + + Parameters + ---------- + arg : list, tuple, ndarray, Series, Index + date to be parced + box : boolean + True boxes result as an Index-like, False returns an ndarray + name : object + None or string for the Index name + tz : object + None or 'utc' + unit : string + None or string of the frequency of the passed data + errors : string + error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' + infer_datetime_format : boolean + inferring format behavior from to_datetime + dayfirst : boolean + dayfirst parsing behavior from to_datetime + yearfirst : boolean + yearfirst parsing behavior from to_datetime + + Returns + ------- + ndarray of parsed dates + Returns: + + - Index-like if box=True + - ndarray of Timestamps if box=False + """ + if isinstance(arg, (list, tuple)): + arg = np.array(arg, dtype='O') + + # these are shortcutable + if is_datetime64tz_dtype(arg): + if not isinstance(arg, DatetimeIndex): + return DatetimeIndex(arg, tz=tz, name=name) + if tz == 'utc': + arg = arg.tz_convert(None).tz_localize(tz) + return arg + + elif is_datetime64_ns_dtype(arg): + if box and not isinstance(arg, DatetimeIndex): + try: + return DatetimeIndex(arg, tz=tz, name=name) + except ValueError: + pass + + return arg + + elif unit is not None: + if format is not None: + raise ValueError("cannot specify both format and unit") + arg = getattr(arg, 'values', arg) + result = tslib.array_with_unit_to_datetime(arg, unit, + errors=errors) + if box: + if errors == 'ignore': + from pandas import Index + return Index(result) + + return DatetimeIndex(result, tz=tz, name=name) + return result + elif getattr(arg, 'ndim', 1) > 1: + raise TypeError('arg must be a string, datetime, list, tuple, ' + '1-d array, or Series') + + arg = _ensure_object(arg) + require_iso8601 = False + + if infer_datetime_format and format is None: + format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + + if format is not None: + # There is a special fast-path for iso8601 formatted + # datetime strings, so in those cases don't use the inferred + # format because this path makes process slower in this + # special case + format_is_iso8601 = _format_is_iso(format) + if format_is_iso8601: + require_iso8601 = not infer_datetime_format + format = None + + try: + result = None + + if format is not None: + # shortcut formatting here + if format == '%Y%m%d': + try: + result = _attempt_YYYYMMDD(arg, errors=errors) + except: + raise ValueError("cannot convert the input to " + "'%Y%m%d' date format") + + # fallback + if result is None: + try: + result, timezones = array_strptime( + arg, format, exact=exact, errors=errors) + if '%Z' in format or '%z' in format: + return _return_parsed_timezone_results( + result, timezones, box, tz) + except tslib.OutOfBoundsDatetime: + if errors == 'raise': + raise + result = arg + except ValueError: + # if format was inferred, try falling back + # to array_to_datetime - terminate here + # for specified formats + if not infer_datetime_format: + if errors == 'raise': + raise + result = arg + + if result is None and (format is None or infer_datetime_format): + result = tslib.array_to_datetime( + arg, + errors=errors, + utc=tz == 'utc', + dayfirst=dayfirst, + yearfirst=yearfirst, + require_iso8601=require_iso8601 + ) + + if is_datetime64_dtype(result) and box: + result = DatetimeIndex(result, tz=tz, name=name) + return result + + except ValueError as e: + try: + values, tz = conversion.datetime_to_datetime64(arg) + return DatetimeIndex._simple_new(values, name=name, tz=tz) + except (ValueError, TypeError): + raise e + + +def _adjust_to_origin(arg, origin, unit): + """ + Adjust input argument to the specified origin + + Parameters + ---------- + arg : list, tuple, ndarray, Series, Index + date to be adjusted + origin : 'julian' or Timestamp + origin offset for the arg + unit : string + passed unit from to_datetime, must be 'D' + Returns + ------- + ndarray of adjusted dates + """ + if origin == 'julian': + original = arg + j0 = tslib.Timestamp(0).to_julian_date() + if unit != 'D': + raise ValueError("unit must be 'D' for origin='julian'") + try: + arg = arg - j0 + except: + raise ValueError("incompatible 'arg' type for given " + "'origin'='julian'") + + # premptively check this for a nice range + j_max = tslib.Timestamp.max.to_julian_date() - j0 + j_min = tslib.Timestamp.min.to_julian_date() - j0 + if np.any(arg > j_max) or np.any(arg < j_min): + raise tslib.OutOfBoundsDatetime( + "{original} is Out of Bounds for " + "origin='julian'".format(original=original)) + else: + # arg must be numeric + if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or + is_numeric_dtype(np.asarray(arg))): + raise ValueError( + "'{arg}' is not compatible with origin='{origin}'; " + "it must be numeric with a unit specified ".format( + arg=arg, + origin=origin)) + + # we are going to offset back to unix / epoch time + try: + offset = tslib.Timestamp(origin) + except tslib.OutOfBoundsDatetime: + raise tslib.OutOfBoundsDatetime( + "origin {origin} is Out of Bounds".format(origin=origin)) + except ValueError: + raise ValueError("origin {origin} cannot be converted " + "to a Timestamp".format(origin=origin)) + + if offset.tz is not None: + raise ValueError( + "origin offset {} must be tz-naive".format(offset)) + offset -= tslib.Timestamp(0) + + # convert the offset to the unit of the arg + # this should be lossless in terms of precision + offset = offset // tslib.Timedelta(1, unit=unit) + + # scalars & ndarray-like can handle the addition + if is_list_like(arg) and not isinstance( + arg, (ABCSeries, ABCIndexClass, np.ndarray)): + arg = np.asarray(arg) + arg = arg + offset + return arg + + def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', @@ -308,205 +523,44 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, """ from pandas.core.indexes.datetimes import DatetimeIndex - tz = 'utc' if utc else None - - def _convert_listlike(arg, box, format, name=None, tz=tz): - - if isinstance(arg, (list, tuple)): - arg = np.array(arg, dtype='O') - - # these are shortcutable - if is_datetime64tz_dtype(arg): - if not isinstance(arg, DatetimeIndex): - return DatetimeIndex(arg, tz=tz, name=name) - if utc: - arg = arg.tz_convert(None).tz_localize('UTC') - return arg - - elif is_datetime64_ns_dtype(arg): - if box and not isinstance(arg, DatetimeIndex): - try: - return DatetimeIndex(arg, tz=tz, name=name) - except ValueError: - pass - - return arg - - elif unit is not None: - if format is not None: - raise ValueError("cannot specify both format and unit") - arg = getattr(arg, 'values', arg) - result = tslib.array_with_unit_to_datetime(arg, unit, - errors=errors) - if box: - if errors == 'ignore': - from pandas import Index - return Index(result) - - return DatetimeIndex(result, tz=tz, name=name) - return result - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a string, datetime, list, tuple, ' - '1-d array, or Series') - - arg = _ensure_object(arg) - require_iso8601 = False - - if infer_datetime_format and format is None: - format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - - if format is not None: - # There is a special fast-path for iso8601 formatted - # datetime strings, so in those cases don't use the inferred - # format because this path makes process slower in this - # special case - format_is_iso8601 = _format_is_iso(format) - if format_is_iso8601: - require_iso8601 = not infer_datetime_format - format = None - - try: - result = None - - if format is not None: - # shortcut formatting here - if format == '%Y%m%d': - try: - result = _attempt_YYYYMMDD(arg, errors=errors) - except: - raise ValueError("cannot convert the input to " - "'%Y%m%d' date format") - - # fallback - if result is None: - try: - result, timezones = array_strptime( - arg, format, exact=exact, errors=errors) - if '%Z' in format or '%z' in format: - return _return_parsed_timezone_results( - result, timezones, box, tz) - except tslib.OutOfBoundsDatetime: - if errors == 'raise': - raise - result = arg - except ValueError: - # if format was inferred, try falling back - # to array_to_datetime - terminate here - # for specified formats - if not infer_datetime_format: - if errors == 'raise': - raise - result = arg - - if result is None and (format is None or infer_datetime_format): - result = tslib.array_to_datetime( - arg, - errors=errors, - utc=utc, - dayfirst=dayfirst, - yearfirst=yearfirst, - require_iso8601=require_iso8601 - ) - - if is_datetime64_dtype(result) and box: - result = DatetimeIndex(result, tz=tz, name=name) - return result - - except ValueError as e: - try: - values, tz = conversion.datetime_to_datetime64(arg) - return DatetimeIndex._simple_new(values, name=name, tz=tz) - except (ValueError, TypeError): - raise e - if arg is None: return None - # handle origin - if origin == 'julian': + if origin != 'unix': + arg = _adjust_to_origin(arg, origin, unit) - original = arg - j0 = tslib.Timestamp(0).to_julian_date() - if unit != 'D': - raise ValueError("unit must be 'D' for origin='julian'") - try: - arg = arg - j0 - except: - raise ValueError("incompatible 'arg' type for given " - "'origin'='julian'") - - # premptively check this for a nice range - j_max = tslib.Timestamp.max.to_julian_date() - j0 - j_min = tslib.Timestamp.min.to_julian_date() - j0 - if np.any(arg > j_max) or np.any(arg < j_min): - raise tslib.OutOfBoundsDatetime( - "{original} is Out of Bounds for " - "origin='julian'".format(original=original)) - - elif origin not in ['unix', 'julian']: - - # arg must be a numeric - original = arg - if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or - is_numeric_dtype(np.asarray(arg))): - raise ValueError( - "'{arg}' is not compatible with origin='{origin}'; " - "it must be numeric with a unit specified ".format( - arg=arg, - origin=origin)) - - # we are going to offset back to unix / epoch time - try: - offset = tslib.Timestamp(origin) - except tslib.OutOfBoundsDatetime: - raise tslib.OutOfBoundsDatetime( - "origin {origin} is Out of Bounds".format(origin=origin)) - except ValueError: - raise ValueError("origin {origin} cannot be converted " - "to a Timestamp".format(origin=origin)) - - if offset.tz is not None: - raise ValueError( - "origin offset {} must be tz-naive".format(offset)) - offset -= tslib.Timestamp(0) - - # convert the offset to the unit of the arg - # this should be lossless in terms of precision - offset = offset // tslib.Timedelta(1, unit=unit) - - # scalars & ndarray-like can handle the addition - if is_list_like(arg) and not isinstance( - arg, (ABCSeries, ABCIndexClass, np.ndarray)): - arg = np.asarray(arg) - arg = arg + offset + tz = 'utc' if utc else None + convert_listlike = partial(_convert_listlike_datetimes, name=name, tz=tz, + unit=unit, error=errors) if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): - cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) + cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: result = arg.map(cache_array) else: from pandas import Series - values = _convert_listlike(arg._values, True, format) + values = convert_listlike(arg._values, True, format) result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): - cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) + cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array, box, errors, name=arg.name) else: - result = _convert_listlike(arg, box, format, name=arg.name) + convert_listlike = partial(convert_listlike, name=arg.name) + result = convert_listlike(arg, box, format) elif is_list_like(arg): - cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) + cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array, box, errors) else: - result = _convert_listlike(arg, box, format) + result = convert_listlike(arg, box, format) else: - result = _convert_listlike(np.array([arg]), box, format)[0] + result = convert_listlike(np.array([arg]), box, format)[0] return result From b2b1104874580766135be872313b58612fb601a7 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 1 Jul 2018 09:58:22 -0700 Subject: [PATCH 2/4] More cleanup --- pandas/core/tools/datetimes.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index baf5a7d47974e..9aab30b83bec9 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1,4 +1,4 @@ -import partial +from functools import partial from datetime import datetime, timedelta, time from collections import MutableMapping @@ -142,7 +142,7 @@ def _return_parsed_timezone_results(result, timezones, box, tz): def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, unit=None, errors=None, infer_datetime_format=None, dayfirst=None, - yearfirst=None): + yearfirst=None, exact=None): """ Helper function for to_datetime. Performs the conversions of 1D listlike of dates @@ -167,7 +167,9 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, dayfirst parsing behavior from to_datetime yearfirst : boolean yearfirst parsing behavior from to_datetime - + exact : boolean + exact format matching behavior from to_datetime + Returns ------- ndarray of parsed dates @@ -176,6 +178,7 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, - Index-like if box=True - ndarray of Timestamps if box=False """ + from pandas import DatetimeIndex if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') @@ -521,8 +524,6 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, pandas.DataFrame.astype : Cast argument to a specified dtype. pandas.to_timedelta : Convert argument to timedelta. """ - from pandas.core.indexes.datetimes import DatetimeIndex - if arg is None: return None @@ -530,8 +531,10 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, arg = _adjust_to_origin(arg, origin, unit) tz = 'utc' if utc else None - convert_listlike = partial(_convert_listlike_datetimes, name=name, tz=tz, - unit=unit, error=errors) + convert_listlike = partial(_convert_listlike_datetimes, tz=tz, unit=unit, + dayfirst=dayfirst, yearfirst=dayfirst, + errors=errors, exact=exact, + infer_datetime_format=infer_datetime_format) if isinstance(arg, tslib.Timestamp): result = arg From 3e0c36b56c217d3297e1e7cb6a75ae2d046672ab Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 1 Jul 2018 17:21:27 -0700 Subject: [PATCH 3/4] fix yearfirst typo --- pandas/core/tools/datetimes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9aab30b83bec9..917f0ec14c13e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -289,6 +289,7 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, def _adjust_to_origin(arg, origin, unit): """ + Helper function for to_datetime. Adjust input argument to the specified origin Parameters @@ -532,7 +533,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, tz = 'utc' if utc else None convert_listlike = partial(_convert_listlike_datetimes, tz=tz, unit=unit, - dayfirst=dayfirst, yearfirst=dayfirst, + dayfirst=dayfirst, yearfirst=yearfirst, errors=errors, exact=exact, infer_datetime_format=infer_datetime_format) From e9320f103291d83c7db7917cb513e02749447ae4 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 2 Jul 2018 22:23:33 -0700 Subject: [PATCH 4/4] adjust spacing --- pandas/core/tools/datetimes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 917f0ec14c13e..a99c913f95e82 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -300,9 +300,10 @@ def _adjust_to_origin(arg, origin, unit): origin offset for the arg unit : string passed unit from to_datetime, must be 'D' + Returns ------- - ndarray of adjusted dates + ndarray or scalar of adjusted date(s) """ if origin == 'julian': original = arg