From f58fb0c40942f6de5e4786a7359d0978909a6cc0 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 22 Aug 2013 13:33:04 -0400 Subject: [PATCH] BUG: (GH4626) Fix decoding based on a passed in non-default encoding in pd.read_stata --- doc/source/release.rst | 1 + pandas/io/common.py | 38 ++++++++++++++++------- pandas/io/stata.py | 26 +++++++--------- pandas/io/tests/data/stata1_encoding.dta | Bin 0 -> 3507 bytes pandas/io/tests/test_stata.py | 18 ++++++++++- 5 files changed, 57 insertions(+), 26 deletions(-) create mode 100644 pandas/io/tests/data/stata1_encoding.dta diff --git a/doc/source/release.rst b/doc/source/release.rst index 16179cdeca052..929c167cd1340 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -286,6 +286,7 @@ See :ref:`Internal Refactoring` the DateOffset from being cached (:issue:`4609`) - Fix boolean comparison with a DataFrame on the lhs, and a list/tuple on the rhs (:issue:`4576`) - Fix error/dtype conversion with setitem of ``None`` on ``Series/DataFrame`` (:issue:`4667`) + - Fix decoding based on a passed in non-default encoding in ``pd.read_stata`` (:issue:`4626`) pandas 0.12 =========== diff --git a/pandas/io/common.py b/pandas/io/common.py index f4a4ef789510c..02242c5a91493 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -66,6 +66,32 @@ def _is_s3_url(url): return False +def maybe_read_encoded_stream(reader, encoding=None): + """ read an encoded stream from the reader and transform the bytes to unicode + if required based on the encoding + + Parameters + ---------- + reader : a streamable file-like object + encoding : optional, the encoding to attempt to read + + Returns + ------- + a tuple of (a stream of decoded bytes, the encoding which was used) + + """ + + if compat.PY3 or encoding is not None: # pragma: no cover + if encoding: + errors = 'strict' + else: + errors = 'replace' + encoding = 'utf-8' + reader = StringIO(reader.read().decode(encoding, errors)) + else: + encoding = None + return reader, encoding + def get_filepath_or_buffer(filepath_or_buffer, encoding=None): """ If the filepath_or_buffer is a url, translate and return the buffer @@ -83,17 +109,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None): if _is_url(filepath_or_buffer): req = _urlopen(str(filepath_or_buffer)) - if compat.PY3: # pragma: no cover - if encoding: - errors = 'strict' - else: - errors = 'replace' - encoding = 'utf-8' - out = StringIO(req.read().decode(encoding, errors)) - else: - encoding = None - out = req - return out, encoding + return maybe_read_encoded_stream(req,encoding) if _is_s3_url(filepath_or_buffer): try: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 9d21e10d69982..bab90a77c56e8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -24,7 +24,7 @@ from pandas.compat import StringIO, long, lrange, lmap, lzip from pandas import isnull from pandas.io.parsers import _parser_params, Appender -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer, maybe_read_encoded_stream _read_stata_doc = """ @@ -203,11 +203,10 @@ def __repr__(self): class StataParser(object): - def __init__(self, encoding): - if(encoding is None): - self._encoding = 'cp1252' - else: - self._encoding = encoding + _default_encoding = 'cp1252' + + def __init__(self, encoding=None): + self._encoding = encoding #type code. #-------------------- @@ -256,7 +255,7 @@ def __init__(self, encoding): } def _decode_bytes(self, str, errors=None): - if compat.PY3: + if compat.PY3 or self._encoding is not None: return str.decode(self._encoding, errors) else: return str @@ -286,7 +285,8 @@ class StataReader(StataParser): Encoding used to parse the files. Note that Stata doesn't support unicode. None defaults to cp1252. """ - def __init__(self, path_or_buf, encoding=None): + + def __init__(self, path_or_buf, encoding='cp1252'): super(StataReader, self).__init__(encoding) self.col_sizes = () self._has_string_data = False @@ -295,8 +295,6 @@ def __init__(self, path_or_buf, encoding=None): self._value_labels_read = False if isinstance(path_or_buf, str): path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252') - if encoding is not None: - self._encoding = encoding if isinstance(path_or_buf, (str, compat.text_type, bytes)): self.path_or_buf = open(path_or_buf, 'rb') @@ -403,13 +401,13 @@ def _unpack(self, fmt, byt): return d def _null_terminate(self, s): - if compat.PY3: # have bytes not strings, so must decode + if compat.PY3 or self._encoding is not None: # have bytes not strings, so must decode null_byte = b"\0" try: s = s[:s.index(null_byte)] except: pass - return s.decode(self._encoding) + return s.decode(self._encoding or self._default_encoding) else: null_byte = "\0" try: @@ -744,7 +742,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, encoding=" if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) - self._file = _open_file_binary_write(fname, self._encoding) + self._file = _open_file_binary_write(fname, self._encoding or self._default_encoding) self.type_converters = {253: np.long, 252: int} def _write(self, to_write): @@ -752,7 +750,7 @@ def _write(self, to_write): Helper to call encode before writing to file for Python 3 compat. """ if compat.PY3: - self._file.write(to_write.encode(self._encoding)) + self._file.write(to_write.encode(self._encoding or self._default_encoding)) else: self._file.write(to_write) diff --git a/pandas/io/tests/data/stata1_encoding.dta b/pandas/io/tests/data/stata1_encoding.dta new file mode 100644 index 0000000000000000000000000000000000000000..b4230eda73e06b49c18e8c685d15ee5f5199e021 GIT binary patch literal 3507 zcmXS7Vq{=sn9cwM^&o-)$YWp-n6cCm#Qspi2BH`g7C`t8(`PYG2daCJ0}%(Q1p(*H zrVb!FLG~n=R(N*8fx*a7!Lc-5!N|bSSi#WT%EXL8D7z>%v)ItW#FC*nJ~_WMuLPtC z6)-cX8d?;iic!i7C@L*3&P>cxa0b~~soZLfax(MM|>mV4!jV02;b2 Ap8x;= literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 31472dc667847..77ddafe838da4 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -13,7 +13,7 @@ from pandas.io.stata import read_stata, StataReader import pandas.util.testing as tm from pandas.util.misc import is_little_endian - +from pandas import compat class StataTests(unittest.TestCase): @@ -32,6 +32,7 @@ def setUp(self): self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv') self.dta9 = os.path.join(self.dirpath, 'lbw.dta') self.csv9 = os.path.join(self.dirpath, 'lbw.csv') + self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) @@ -202,6 +203,21 @@ def test_stata_doc_examples(self): df = DataFrame(np.random.randn(10, 2), columns=list('AB')) df.to_stata(path) + def test_encoding(self): + + # GH 4626, proper encoding handling + raw = read_stata(self.dta_encoding) + encoded = read_stata(self.dta_encoding, encoding="latin-1") + result = encoded.kreis1849[0] + + if compat.PY3: + expected = raw.kreis1849[0] + self.assert_(result == expected) + self.assert_(isinstance(result,compat.string_types)) + else: + expected = raw.kreis1849.str.decode("latin-1")[0] + self.assert_(result == expected) + self.assert_(isinstance(result,unicode)) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],