diff --git a/doc/source/io.rst b/doc/source/io.rst index 91ffb5091e927..32af1924aee70 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3529,6 +3529,13 @@ outside of this range, the data is cast to ``int16``. Conversion from ``int64`` to ``float64`` may result in a loss of precision if ``int64`` values are larger than 2**53. +.. warning:: + :class:`~pandas.io.stata.StataWriter` and + :func:`~pandas.core.frame.DataFrame.to_stata` only support fixed width + strings containing up to 244 characters, a limitation imposed by the version + 115 dta file format. Attempting to write *Stata* dta files with strings + longer than 244 characters raises a ``ValueError``. + .. _io.stata_reader: diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index dbdae6ed7144e..e5ba8efd25b02 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -119,6 +119,11 @@ API changes - The ``infer_types`` argument to :func:`~pandas.io.html.read_html` now has no effect (:issue:`7762`, :issue:`7032`). +- ``DataFrame.to_stata`` and ``StataWriter`` check string length for + compatibility with limitations imposed in dta files where fixed-width + strings must contain 244 or fewer characters. Attempting to write Stata + dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`) + .. 
_whatsnew_0150.cat: @@ -312,7 +317,7 @@ Bug Fixes - Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`) - Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`) - +- Bug in ``StataReader`` where strings were always converted to 244 characters-fixed width irrespective of underlying string size (:issue:`7858`) - Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_cov``, ``ewmcov``, and ``ewmcorr`` returning results with columns sorted by name and producing an error for non-unique columns; now handles non-unique columns and returns columns in original order diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3458a95ac096d..5b5ce3e59e16e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -23,6 +23,7 @@ from pandas.compat import long, lrange, lmap, lzip, text_type, string_types from pandas import isnull from pandas.io.common import get_filepath_or_buffer +from pandas.lib import max_len_string_array, is_string_array from pandas.tslib import NaT def read_stata(filepath_or_buffer, convert_dates=True, @@ -181,6 +182,11 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +excessive_string_length_error = """ +Fixed width strings in Stata .dta files are limited to 244 (or fewer) characters. +Column '%s' does not satisfy this restriction. +""" + class PossiblePrecisionLoss(Warning): pass @@ -1040,12 +1046,14 @@ def _dtype_to_stata_type(dtype): "Please report an error to the developers." % dtype) -def _dtype_to_default_stata_fmt(dtype): +def _dtype_to_default_stata_fmt(dtype, column): """ Maps numpy dtype to stata's default format for this type. Not terribly important since users can change this in Stata. 
Semantics are string -> "%DDs" where DD is the length of the string + object -> "%DDs" where DD is the length of the string, if a string, or 244 + for anything that cannot be converted to a string. float64 -> "%10.0g" float32 -> "%9.0g" int64 -> "%9.0g" @@ -1055,9 +1063,21 @@ def _dtype_to_default_stata_fmt(dtype): """ #TODO: expand this to handle a default datetime format? if dtype.type == np.string_: + if max_len_string_array(column.values) > 244: + raise ValueError(excessive_string_length_error % column.name) + return "%" + str(dtype.itemsize) + "s" elif dtype.type == np.object_: - return "%244s" + try: + # Try to use optimal size if available + itemsize = max_len_string_array(column.values) + except: + # Default size + itemsize = 244 + if itemsize > 244: + raise ValueError(excessive_string_length_error % column.name) + + return "%" + str(itemsize) + "s" elif dtype == np.float64: return "%10.0g" elif dtype == np.float32: @@ -1264,7 +1284,9 @@ def __iter__(self): ) dtypes[key] = np.dtype(new_type) self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes] - self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes] + self.fmtlist = [] + for col, dtype in dtypes.iteritems(): + self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col])) # set the given format for the datetime cols if self._convert_dates is not None: for key in self._convert_dates: diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 5271604235922..459a1fe6c0e89 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -565,6 +565,30 @@ def test_variable_labels(self): self.assertTrue(k in keys) self.assertTrue(v in labels) + def test_minimal_size_col(self): + str_lens = (1, 100, 244) + s = {} + for str_len in str_lens: + s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len]) + original = DataFrame(s) + with tm.ensure_clean() as path: + original.to_stata(path, write_index=False) + sr = StataReader(path) + 
variables = sr.varlist + formats = sr.fmtlist + for variable, fmt in zip(variables, formats): + self.assertTrue(int(variable[1:]) == int(fmt[1:-1])) + + def test_excessively_long_string(self): + str_lens = (1, 244, 500) + s = {} + for str_len in str_lens: + s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len]) + original = DataFrame(s) + with tm.assertRaises(ValueError): + with tm.ensure_clean() as path: + original.to_stata(path) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],