diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 5bd0f46dd0b18..ddd97c8d1b199 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -384,4 +384,6 @@ Bug Fixes - Bug in operator equal on Index not being consistent with Series (:issue:`9947`) -- Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`). \ No newline at end of file +- Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`). + +- Bug in ``read_msgpack`` where the DataFrame to decode has duplicate column names (:issue:`9618`) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index f5e000449f232..847a7c4f90216 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -357,6 +357,7 @@ def encode(obj): 'klass': obj.__class__.__name__, 'axes': data.axes, 'blocks': [{'items': data.items.take(b.mgr_locs), + 'locs': b.mgr_locs.as_array, 'values': convert(b.values), 'shape': b.values.shape, 'dtype': b.dtype.num, @@ -485,9 +486,15 @@ def decode(obj): def create_block(b): values = unconvert(b['values'], dtype_for(b['dtype']), b['compress']).reshape(b['shape']) + + # locs handles duplicate column names, and should be used instead of items; see GH 9618 + if 'locs' in b: + placement = b['locs'] + else: + placement = axes[0].get_indexer(b['items']) return make_block(values=values, klass=getattr(internals, b['klass']), - placement=axes[0].get_indexer(b['items'])) + placement=placement) blocks = [create_block(b) for b in obj['blocks']] return globals()[obj['klass']](BlockManager(blocks, axes)) diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack b/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack new file mode 100644 index 0000000000000..6bf1b9b9afaaa Binary files /dev/null and 
b/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack differ diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack b/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack new file mode 100644 index 0000000000000..6607570797846 Binary files /dev/null and b/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack differ diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle b/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle new file mode 100644 index 0000000000000..60101c2f1e95e Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle differ diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle b/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle new file mode 100644 index 0000000000000..6d5451f96e20d Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle differ diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py deleted file mode 100644 index 2d93ecf38a76d..0000000000000 --- a/pandas/io/tests/generate_legacy_pickles.py +++ /dev/null @@ -1,167 +0,0 @@ -""" self-contained to write legacy pickle files """ -from __future__ import print_function - - -def _create_sp_series(): - - import numpy as np - from pandas import SparseSeries - - nan = np.nan - - # nan-based - arr = np.arange(15, dtype=np.float64) - arr[7:12] = nan - arr[-1:] = nan - - bseries = SparseSeries(arr, kind='block') - bseries.name = 'bseries' - return bseries - -def _create_sp_tsseries(): - - import numpy as np - from pandas import bdate_range, SparseTimeSeries - - nan = np.nan - - # nan-based - arr = np.arange(15, dtype=np.float64) - arr[7:12] = nan - arr[-1:] = nan - - date_index = bdate_range('1/1/2011', periods=len(arr)) - bseries = 
SparseTimeSeries(arr, index=date_index, kind='block') - bseries.name = 'btsseries' - return bseries - -def _create_sp_frame(): - import numpy as np - from pandas import bdate_range, SparseDataFrame - - nan = np.nan - - data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10).astype(np.int64), - 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} - - dates = bdate_range('1/1/2011', periods=10) - return SparseDataFrame(data, index=dates) - -def create_data(): - """ create the pickle data """ - - from distutils.version import LooseVersion - import numpy as np - import pandas - from pandas import (Series,TimeSeries,DataFrame,Panel, - SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel, - Index,MultiIndex,PeriodIndex, - date_range,period_range,bdate_range,Timestamp,Categorical) - nan = np.nan - - data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E' : [0., 1, Timestamp('20100101'),'foo',2.], - } - - index = dict(int = Index(np.arange(10)), - date = date_range('20130101',periods=10), - period = period_range('2013-01-01', freq='M', periods=10)) - - mi = dict(reg2 = MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), - names=['first', 'second'])) - series = dict(float = Series(data['A']), - int = Series(data['B']), - mixed = Series(data['E']), - ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)), - mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2], - [3,4,3,4,5]])), - names=['one','two'])), - dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), - cat=Series(Categorical(['foo', 'bar', 'baz']))) - - frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)), - int = 
DataFrame(dict(A = series['int'] , B = series['int'] + 1)), - mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])), - mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)), - index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'], - ['one','two','one','two','three']])), - names=['first','second'])), - dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), - columns=['A', 'B', 'A']), - cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), - cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']), - B=np.arange(3).astype(np.int64))), - ) - panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)), - dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), - items=['A', 'B', 'A'])) - - if LooseVersion(pandas.__version__) >= '0.14.1': - # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and - # panels if their columns/items were non-unique. - mixed_dup_df = DataFrame(data) - mixed_dup_df.columns = list("ABCDA") - - mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int'])) - mixed_dup_panel.items = ['ItemA', 'ItemA'] - - frame['mixed_dup'] = mixed_dup_df - panel['mixed_dup'] = mixed_dup_panel - - return dict( series = series, - frame = frame, - panel = panel, - index = index, - mi = mi, - sp_series = dict(float = _create_sp_series(), - ts = _create_sp_tsseries()), - sp_frame = dict(float = _create_sp_frame()) - ) - -def write_legacy_pickles(): - - # force our cwd to be the first searched - import sys - sys.path.insert(0,'.') - - import os, os.path - import numpy as np - import pandas - import pandas.util.testing as tm - import platform as pl - - # make sure we are < 0.13 compat (in py3) - try: - from pandas.compat import zip, cPickle as pickle - except: - import pickle - - version = pandas.__version__ - if len(sys.argv) != 2: - exit("Specify output directory: generate_legacy_pickles.py ") - - output_dir = 
str(sys.argv[1]) - - print("This script generates a pickle file for the current arch, system, and python version") - print(" pandas version: {0}".format(version)) - print(" output dir : {0}".format(output_dir)) - - # construct a reasonable platform name - f = '_'.join([ str(version), str(pl.machine()), str(pl.system().lower()), str(pl.python_version()) ]) - pth = '{0}.pickle'.format(f) - - fh = open(os.path.join(output_dir,pth),'wb') - pickle.dump(create_data(),fh,pickle.HIGHEST_PROTOCOL) - fh.close() - - print("created pickle file: %s" % pth) - -if __name__ == '__main__': - write_legacy_pickles() diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/io/tests/generate_legacy_storage_files.py new file mode 100644 index 0000000000000..e7cc89fcc0b61 --- /dev/null +++ b/pandas/io/tests/generate_legacy_storage_files.py @@ -0,0 +1,205 @@ +""" self-contained to write legacy storage (pickle/msgpack) files """ +from __future__ import print_function +from distutils.version import LooseVersion +from pandas import (Series, TimeSeries, DataFrame, Panel, + SparseSeries, SparseTimeSeries, SparseDataFrame, SparsePanel, + Index, MultiIndex, PeriodIndex, bdate_range, to_msgpack, + date_range, period_range, bdate_range, Timestamp, Categorical) +import os +import sys +import numpy as np +import pandas +import pandas.util.testing as tm +import platform as pl + + +def _create_sp_series(): + nan = np.nan + + # nan-based + arr = np.arange(15, dtype=np.float64) + arr[7:12] = nan + arr[-1:] = nan + + bseries = SparseSeries(arr, kind='block') + bseries.name = 'bseries' + return bseries + + +def _create_sp_tsseries(): + nan = np.nan + + # nan-based + arr = np.arange(15, dtype=np.float64) + arr[7:12] = nan + arr[-1:] = nan + + date_index = bdate_range('1/1/2011', periods=len(arr)) + bseries = SparseTimeSeries(arr, index=date_index, kind='block') + bseries.name = 'btsseries' + return bseries + + +def _create_sp_frame(): + nan = np.nan + + data = {'A': [nan, nan, nan, 0, 1, 2, 
3, 4, 5, 6], + 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'C': np.arange(10).astype(np.int64), + 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + + dates = bdate_range('1/1/2011', periods=10) + return SparseDataFrame(data, index=dates) + + +def create_data(): + """ create the pickle/msgpack data """ + + data = { + 'A': [0., 1., 2., 3., np.nan], + 'B': [0, 1, 0, 1, 0], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': date_range('1/1/2009', periods=5), + 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] + } + + index = dict(int=Index(np.arange(10)), + date=date_range('20130101', periods=10), + period=period_range('2013-01-01', freq='M', periods=10)) + + mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), + names=['first', 'second'])) + series = dict(float=Series(data['A']), + int=Series(data['B']), + mixed=Series(data['E']), + ts=TimeSeries(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)), + mi=Series(np.arange(5).astype(np.float64), + index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), + names=['one', 'two'])), + dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), + cat=Series(Categorical(['foo', 'bar', 'baz']))) + + mixed_dup_df = DataFrame(data) + mixed_dup_df.columns = list("ABCDA") + frame = dict(float=DataFrame(dict(A=series['float'], B=series['float'] + 1)), + int=DataFrame(dict(A=series['int'], B=series['int'] + 1)), + mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])), + mi=DataFrame(dict(A=np.arange(5).astype(np.float64), B=np.arange(5).astype(np.int64)), + index=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'baz'], + ['one', 'two', 'one', 'two', 'three']])), + names=['first', 'second'])), + dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), + columns=['A', 'B', 'A']), + 
cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), + cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']), + B=np.arange(3).astype(np.int64))), + mixed_dup=mixed_dup_df) + + mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int'])) + mixed_dup_panel.items = ['ItemA', 'ItemA'] + panel = dict(float=Panel(dict(ItemA=frame['float'], ItemB=frame['float'] + 1)), + dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), + items=['A', 'B', 'A']), + mixed_dup=mixed_dup_panel) + + return dict(series=series, + frame=frame, + panel=panel, + index=index, + mi=mi, + sp_series=dict(float=_create_sp_series(), + ts=_create_sp_tsseries()), + sp_frame=dict(float=_create_sp_frame())) + + +def create_pickle_data(): + data = create_data() + + # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and + # panels if their columns/items were non-unique. + if LooseVersion(pandas.__version__) < '0.14.1': + del data['frame']['mixed_dup'] + del data['panel']['mixed_dup'] + return data + + +def create_msgpack_data(): + data = create_data() + if LooseVersion(pandas.__version__) < '0.17.0': + del data['frame']['mixed_dup'] + del data['panel']['mixed_dup'] + del data['frame']['dup'] + del data['panel']['dup'] + # Not supported + del data['sp_series'] + del data['sp_frame'] + del data['series']['cat'] + del data['frame']['cat_onecol'] + del data['frame']['cat_and_float'] + return data + + +def platform_name(): + return '_'.join([str(pandas.__version__), str(pl.machine()), str(pl.system().lower()), str(pl.python_version())]) + + +def write_legacy_pickles(output_dir): + + # make sure we are < 0.13 compat (in py3) + try: + from pandas.compat import zip, cPickle as pickle + except: + import pickle + + version = pandas.__version__ + + print("This script generates a storage file for the current arch, system, and python version") + print(" pandas version: {0}".format(version)) + print(" output dir : {0}".format(output_dir)) + print(" storage format: 
pickle") + + pth = '{0}.pickle'.format(platform_name()) + + fh = open(os.path.join(output_dir, pth), 'wb') + pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL) + fh.close() + + print("created pickle file: %s" % pth) + + +def write_legacy_msgpack(output_dir): + + version = pandas.__version__ + + print("This script generates a storage file for the current arch, system, and python version") + print(" pandas version: {0}".format(version)) + print(" output dir : {0}".format(output_dir)) + print(" storage format: msgpack") + + pth = '{0}.msgpack'.format(platform_name()) + to_msgpack(os.path.join(output_dir, pth), create_msgpack_data()) + + print("created msgpack file: %s" % pth) + + +def write_legacy_file(): + # force our cwd to be the first searched + sys.path.insert(0, '.') + + if len(sys.argv) != 3: + exit("Specify output directory and storage type: generate_legacy_storage_files.py ") + + output_dir = str(sys.argv[1]) + storage_type = str(sys.argv[2]) + + if storage_type == 'pickle': + write_legacy_pickles(output_dir=output_dir) + elif storage_type == 'msgpack': + write_legacy_msgpack(output_dir=output_dir) + else: + exit("storage_type must be one of {'pickle', 'msgpack'}") + + +if __name__ == '__main__': + write_legacy_file() diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 9f1fd41e90413..33b7cc79083db 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -1,5 +1,6 @@ import nose +import os import datetime import numpy as np import sys @@ -11,7 +12,7 @@ date_range, period_range, Index, SparseSeries, SparseDataFrame, SparsePanel) import pandas.util.testing as tm -from pandas.util.testing import ensure_clean +from pandas.util.testing import ensure_clean, assert_index_equal from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal from pandas.tests.test_panel import assert_panel_equal @@ -39,6 +40,8 @@ def check_arbitrary(a, b): 
assert_frame_equal(a, b) elif isinstance(a, Series): assert_series_equal(a, b) + elif isinstance(a, Index): + assert_index_equal(a, b) else: assert(a == b) @@ -396,6 +399,24 @@ def tests_datetimeindex_freq_issue(self): result = self.encode_decode(df) assert_frame_equal(result, df) + def test_dataframe_duplicate_column_names(self): + + # GH 9618 + expected_1 = DataFrame(columns=['a', 'a']) + expected_2 = DataFrame(columns=[1]*100) + expected_2.loc[0] = np.random.randn(100) + expected_3 = DataFrame(columns=[1, 1]) + expected_3.loc[0] = ['abc', np.nan] + + result_1 = self.encode_decode(expected_1) + result_2 = self.encode_decode(expected_2) + result_3 = self.encode_decode(expected_3) + + assert_frame_equal(result_1, expected_1) + assert_frame_equal(result_2, expected_2) + assert_frame_equal(result_3, expected_3) + + class TestSparse(TestPackers): def _check_roundtrip(self, obj, comparator, **kwargs): @@ -496,6 +517,58 @@ def test_compression_blosc(self): assert_frame_equal(self.frame[k], i_rec[k]) +class TestMsgpack(): + """ + How to add msgpack tests: + + 1. Install pandas version intended to output the msgpack. + + 2. Execute "generate_legacy_storage_files.py" to create the msgpack. + $ python generate_legacy_storage_files.py OUTPUT_DIR msgpack + + 3. Move the created msgpack file to "data/legacy_msgpack/" directory. + + NOTE: TestMsgpack can't be a subclass of tm.Testcase to use test generator. 
+ http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class + """ + def setUp(self): + from pandas.io.tests.generate_legacy_storage_files import create_msgpack_data + self.data = create_msgpack_data() + self.path = u('__%s__.msgpack' % tm.rands(10)) + + def compare(self, vf): + data = read_msgpack(vf) + for typ, dv in data.items(): + for dt, result in dv.items(): + try: + expected = self.data[typ][dt] + except KeyError: + continue + check_arbitrary(result, expected) + + return data + + def read_msgpacks(self, version): + + pth = tm.get_data_path('legacy_msgpack/{0}'.format(str(version))) + n = 0 + for f in os.listdir(pth): + vf = os.path.join(pth, f) + self.compare(vf) + n += 1 + assert n > 0, 'Msgpack files are not tested' + + def test_msgpack(self): + msgpack_path = tm.get_data_path('legacy_msgpack') + n = 0 + for v in os.listdir(msgpack_path): + pth = os.path.join(msgpack_path, v) + if os.path.isdir(pth): + yield self.read_msgpacks, v + n += 1 + assert n > 0, 'Msgpack files are not tested' + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index d1396463f3b23..e691fac215002 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -24,8 +24,8 @@ class TestPickle(): 1. Install pandas version intended to output the pickle. - 2. Execute "generate_legacy_pkcles.py" to create the pickle. - $ python generate_legacy_pickles.py + 2. Execute "generate_legacy_storage_files.py" to create the pickle. + $ python generate_legacy_storage_files.py OUTPUT_DIR pickle 3. Move the created pickle to "data/legacy_pickle/" directory. 
@@ -35,8 +35,8 @@ class TestPickle(): _multiprocess_can_split_ = True def setUp(self): - from pandas.io.tests.generate_legacy_pickles import create_data - self.data = create_data() + from pandas.io.tests.generate_legacy_storage_files import create_pickle_data + self.data = create_pickle_data() self.path = u('__%s__.pickle' % tm.rands(10)) def compare_element(self, typ, result, expected): diff --git a/setup.py b/setup.py index f20b0ac0a5fb5..30c5d1052d9b3 100755 --- a/setup.py +++ b/setup.py @@ -537,6 +537,7 @@ def pxd(name): ], package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5', 'tests/data/legacy_pickle/*/*.pickle', + 'tests/data/legacy_msgpack/*/*.msgpack', 'tests/data/*.csv*', 'tests/data/*.dta', 'tests/data/*.txt',