From cb5fc602395345db2f4e0b22dd3cc90ee5c8bdb4 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Mon, 28 Jul 2014 19:05:05 +0100 Subject: [PATCH] BUG: Fixed incorrect string length calculation when writing strings to Stata Strings were incorrectly written using 244 characters irrespective of the actual length of the underlying string due to changes in pandas where the underlying NumPy datatype of strings is always np.object_, and never np.string_. Closes #7858 String types were also not being checked for excessive length, and DataFrames with strings containing more than 244 characters were producing invalid dta files. Attempting to write long strings raises an error now. --- doc/source/io.rst | 7 +++++++ doc/source/v0.15.0.txt | 7 ++++++- pandas/io/stata.py | 28 +++++++++++++++++++++++++--- pandas/io/tests/test_stata.py | 24 ++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 91ffb5091e927..32af1924aee70 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3529,6 +3529,13 @@ outside of this range, the data is cast to ``int16``. Conversion from ``int64`` to ``float64`` may result in a loss of precision if ``int64`` values are larger than 2**53. +.. warning:: + :class:`~pandas.io.stata.StataWriter` and + :func:`~pandas.core.frame.DataFrame.to_stata` only support fixed width + strings containing up to 244 characters, a limitation imposed by the version + 115 dta file format. Attempting to write *Stata* dta files with strings + longer than 244 characters raises a ``ValueError``. + .. _io.stata_reader: diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index dbdae6ed7144e..e5ba8efd25b02 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -119,6 +119,11 @@ API changes - The ``infer_types`` argument to :func:`~pandas.io.html.read_html` now has no effect (:issue:`7762`, :issue:`7032`). 
+- ``DataFrame.to_stata`` and ``StataWriter`` check string length for + compatibility with limitations imposed in dta files where fixed-width + strings must contain 244 or fewer characters. Attempting to write Stata + dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`) + .. _whatsnew_0150.cat: @@ -312,7 +317,7 @@ Bug Fixes - Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`) - Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`) - +- Bug in ``StataWriter`` where strings were always written as 244-character fixed-width strings irrespective of underlying string size (:issue:`7858`) - Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_cov``, ``ewmcov``, and ``ewmcorr`` returning results with columns sorted by name and producing an error for non-unique columns; now handles non-unique columns and returns columns in original order diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3458a95ac096d..5b5ce3e59e16e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -23,6 +23,7 @@ from pandas.compat import long, lrange, lmap, lzip, text_type, string_types from pandas import isnull from pandas.io.common import get_filepath_or_buffer +from pandas.lib import max_len_string_array, is_string_array from pandas.tslib import NaT def read_stata(filepath_or_buffer, convert_dates=True, @@ -181,6 +182,11 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +excessive_string_length_error = """ +Fixed width strings in Stata .dta files are limited to 244 (or fewer) characters. +Column '%s' does not satisfy this restriction. +""" + class PossiblePrecisionLoss(Warning): pass @@ -1040,12 +1046,14 @@ def _dtype_to_stata_type(dtype): "Please report an error to the developers." 
% dtype) -def _dtype_to_default_stata_fmt(dtype): +def _dtype_to_default_stata_fmt(dtype, column): """ Maps numpy dtype to stata's default format for this type. Not terribly important since users can change this in Stata. Semantics are string -> "%DDs" where DD is the length of the string + object -> "%DDs" where DD is the length of the string, if a string, or 244 + for anything that cannot be converted to a string. float64 -> "%10.0g" float32 -> "%9.0g" int64 -> "%9.0g" @@ -1055,9 +1063,21 @@ def _dtype_to_default_stata_fmt(dtype): """ #TODO: expand this to handle a default datetime format? if dtype.type == np.string_: + if max_len_string_array(column.values) > 244: + raise ValueError(excessive_string_length_error % column.name) + return "%" + str(dtype.itemsize) + "s" elif dtype.type == np.object_: - return "%244s" + try: + # Try to use optimal size if available + itemsize = max_len_string_array(column.values) + except: + # Default size + itemsize = 244 + if itemsize > 244: + raise ValueError(excessive_string_length_error % column.name) + + return "%" + str(itemsize) + "s" elif dtype == np.float64: return "%10.0g" elif dtype == np.float32: @@ -1264,7 +1284,9 @@ def __iter__(self): ) dtypes[key] = np.dtype(new_type) self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes] - self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes] + self.fmtlist = [] + for col, dtype in dtypes.iteritems(): + self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col])) # set the given format for the datetime cols if self._convert_dates is not None: for key in self._convert_dates: diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 5271604235922..459a1fe6c0e89 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -565,6 +565,30 @@ def test_variable_labels(self): self.assertTrue(k in keys) self.assertTrue(v in labels) + def test_minimal_size_col(self): + str_lens = (1, 100, 244) + s = {} + for str_len in 
str_lens: + s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len]) + original = DataFrame(s) + with tm.ensure_clean() as path: + original.to_stata(path, write_index=False) + sr = StataReader(path) + variables = sr.varlist + formats = sr.fmtlist + for variable, fmt in zip(variables, formats): + self.assertTrue(int(variable[1:]) == int(fmt[1:-1])) + + def test_excessively_long_string(self): + str_lens = (1, 244, 500) + s = {} + for str_len in str_lens: + s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len]) + original = DataFrame(s) + with tm.assertRaises(ValueError): + with tm.ensure_clean() as path: + original.to_stata(path) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],