From f58fb0c40942f6de5e4786a7359d0978909a6cc0 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Thu, 22 Aug 2013 13:33:04 -0400
Subject: [PATCH] BUG: (GH4626) Fix decoding based on a passed in non-default
 encoding in pd.read_stata

---
 doc/source/release.rst                   |   1 +
 pandas/io/common.py                      |  38 ++++++++++++++++-------
 pandas/io/stata.py                       |  26 +++++++---------
 pandas/io/tests/data/stata1_encoding.dta | Bin 0 -> 3507 bytes
 pandas/io/tests/test_stata.py            |  18 ++++++++++-
 5 files changed, 57 insertions(+), 26 deletions(-)
 create mode 100644 pandas/io/tests/data/stata1_encoding.dta

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 16179cdeca052..929c167cd1340 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -286,6 +286,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
     the DateOffset from being cached (:issue:`4609`)
   - Fix boolean comparison with a DataFrame on the lhs, and a list/tuple on the rhs (:issue:`4576`)
   - Fix error/dtype conversion with setitem of ``None`` on ``Series/DataFrame`` (:issue:`4667`)
+  - Fix decoding when a non-default encoding is passed to ``pd.read_stata`` (:issue:`4626`)
 
 pandas 0.12
 ===========
diff --git a/pandas/io/common.py b/pandas/io/common.py
index f4a4ef789510c..02242c5a91493 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -66,6 +66,32 @@ def _is_s3_url(url):
         return False
 
 
+def maybe_read_encoded_stream(reader, encoding=None):
+    """ Read all bytes from the reader and, when decoding applies, return the
+        decoded text wrapped in a StringIO; otherwise return the reader as is.
+
+        Parameters
+        ----------
+        reader : file-like object; the result of read() must support .decode
+        encoding : optional; decoded strictly if given, else utf-8 + 'replace'
+
+        Returns
+        -------
+        a tuple of (stream, encoding): a StringIO and the encoding used when
+        compat.PY3 is set or an encoding was passed, else the reader and None
+        """
+
+    if compat.PY3 or encoding is not None:  # pragma: no cover
+        if encoding:
+            errors = 'strict'
+        else:
+            errors = 'replace'
+            encoding = 'utf-8'
+        reader = StringIO(reader.read().decode(encoding, errors))
+    else:
+        encoding = None  # already None on this branch; kept explicit
+    return reader, encoding
+
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
     """
     If the filepath_or_buffer is a url, translate and return the buffer
@@ -83,17 +109,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
 
     if _is_url(filepath_or_buffer):
         req = _urlopen(str(filepath_or_buffer))
-        if compat.PY3:  # pragma: no cover
-            if encoding:
-                errors = 'strict'
-            else:
-                errors = 'replace'
-                encoding = 'utf-8'
-            out = StringIO(req.read().decode(encoding, errors))
-        else:
-            encoding = None
-            out = req
-        return out, encoding
+        return maybe_read_encoded_stream(req,encoding)
 
     if _is_s3_url(filepath_or_buffer):
         try:
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 9d21e10d69982..bab90a77c56e8 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -24,7 +24,7 @@
 from pandas.compat import StringIO, long, lrange, lmap, lzip
 from pandas import isnull
 from pandas.io.parsers import _parser_params, Appender
-from pandas.io.common import get_filepath_or_buffer
+from pandas.io.common import get_filepath_or_buffer, maybe_read_encoded_stream
 
 
 _read_stata_doc = """
@@ -203,11 +203,10 @@ def __repr__(self):
 
 
 class StataParser(object):
-    def __init__(self, encoding):
-        if(encoding is None):
-            self._encoding = 'cp1252'
-        else:
-            self._encoding = encoding
+    _default_encoding = 'cp1252'
+
+    def __init__(self, encoding=None):
+        self._encoding = encoding
 
         #type          code.
         #--------------------
@@ -256,7 +255,7 @@ def __init__(self, encoding):
             }
 
     def _decode_bytes(self, str, errors=None):
-        if compat.PY3:
-            return str.decode(self._encoding, errors)
+        if compat.PY3 or self._encoding is not None:  # None -> class default
+            return str.decode(self._encoding or self._default_encoding, errors)
         else:
             return str
@@ -286,7 +285,8 @@ class StataReader(StataParser):
         Encoding used to parse the files. Note that Stata doesn't
         support unicode. None defaults to cp1252.
     """
-    def __init__(self, path_or_buf, encoding=None):
+
+    def __init__(self, path_or_buf, encoding='cp1252'):
         super(StataReader, self).__init__(encoding)
         self.col_sizes = ()
         self._has_string_data = False
@@ -295,8 +295,6 @@ def __init__(self, path_or_buf, encoding=None):
         self._value_labels_read = False
         if isinstance(path_or_buf, str):
             path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252')
-            if encoding is not None:
-                self._encoding = encoding
 
         if isinstance(path_or_buf, (str, compat.text_type, bytes)):
             self.path_or_buf = open(path_or_buf, 'rb')
@@ -403,13 +401,13 @@ def _unpack(self, fmt, byt):
         return d
 
     def _null_terminate(self, s):
-        if compat.PY3:  # have bytes not strings, so must decode
+        if compat.PY3 or self._encoding is not None:  # have bytes not strings, so must decode
             null_byte = b"\0"
             try:
                 s = s[:s.index(null_byte)]
             except:
                 pass
-            return s.decode(self._encoding)
+            return s.decode(self._encoding or self._default_encoding)
         else:
             null_byte = "\0"
             try:
@@ -744,7 +742,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="
         if byteorder is None:
             byteorder = sys.byteorder
         self._byteorder = _set_endianness(byteorder)
-        self._file = _open_file_binary_write(fname, self._encoding)
+        self._file = _open_file_binary_write(fname, self._encoding or self._default_encoding)
         self.type_converters = {253: np.long, 252: int}
 
     def _write(self, to_write):
@@ -752,7 +750,7 @@ def _write(self, to_write):
         Helper to call encode before writing to file for Python 3 compat.
         """
         if compat.PY3:
-            self._file.write(to_write.encode(self._encoding))
+            self._file.write(to_write.encode(self._encoding or self._default_encoding))
         else:
             self._file.write(to_write)
 
diff --git a/pandas/io/tests/data/stata1_encoding.dta b/pandas/io/tests/data/stata1_encoding.dta
new file mode 100644
index 0000000000000000000000000000000000000000..b4230eda73e06b49c18e8c685d15ee5f5199e021
GIT binary patch
literal 3507
zcmXS7Vq{=sn9cwM^&o-)$YWp-n6cCm#Qspi2BH`g7C`t8(`PYG2daCJ0}%(Q1p(*H
zrVb!FLG~n=R(N*8fx*a7!Lc-5!N|bSSi#WT%EXL8D7z>%v)ItW#FC*nJ~_WMuLPtC
z6)-cX8d?;iic!i7C@L*3&P>cxa0b~~so<BGo2sCzs{nL~0Zhy#H7zqQvm`S=kAY#r
zy#%fQ(BJ^u?ZOJC{sV!VDUiwN4kY4JN)j2ATtZ!ggI#@G{Da(-7>ZLfax(MM<BL*K
zi%W{)LxM}n5{noZprHm1WuV5BZ)Sr-k)bjbD9h+cq*(=td8s)#%}UNJsmx4aV2mKr
zw3O5wp!0FLOxsY;02xGR19=RLE<_rhSyEJn+o^^`hF?i0Bptv~9VOwHNysc{_yIl0
zz~J(yxHvT@CBG<*!A*rB6__GMMMgtoG$oAYg3(+snhQpA!DucR%>|>mV4!jV02;b2
Ap8x;=

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 31472dc667847..77ddafe838da4 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -13,7 +13,7 @@
 from pandas.io.stata import read_stata, StataReader
 import pandas.util.testing as tm
 from pandas.util.misc import is_little_endian
-
+from pandas import compat
 
 class StataTests(unittest.TestCase):
 
@@ -32,6 +32,7 @@ def setUp(self):
         self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv')
         self.dta9 = os.path.join(self.dirpath, 'lbw.dta')
         self.csv9 = os.path.join(self.dirpath, 'lbw.csv')
+        self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
 
     def read_dta(self, file):
         return read_stata(file, convert_dates=True)
@@ -202,6 +203,21 @@ def test_stata_doc_examples(self):
             df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
             df.to_stata(path)
 
+    def test_encoding(self):
+        # GH 4626, proper encoding handling: read the same file once with the
+        # default encoding and once with encoding="latin-1", then compare
+        raw = read_stata(self.dta_encoding)
+        encoded = read_stata(self.dta_encoding, encoding="latin-1")
+        result = encoded.kreis1849[0]
+
+        if compat.PY3:
+            expected = raw.kreis1849[0]
+            self.assert_(result == expected)
+            self.assert_(isinstance(result,compat.string_types))
+        else:
+            expected = raw.kreis1849.str.decode("latin-1")[0]
+            self.assert_(result == expected)
+            self.assert_(isinstance(result,unicode))
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],