Skip to content

Commit 9f6a2ed

Browse files
committed
BUG: fixed string appending when length of subsequent is longer/shorter than existing
removed meta data saving; disabled memory tests (and put a try/except around the psutil import)
1 parent 71a4420 commit 9f6a2ed

File tree

2 files changed

+89
-29
lines changed

2 files changed

+89
-29
lines changed

pandas/io/pytables.py

Lines changed: 50 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from pandas.tseries.api import PeriodIndex, DatetimeIndex
2222
from pandas.core.common import adjoin
2323
from pandas.core.algorithms import match, unique
24-
24+
from pandas.core.strings import str_len
2525
from pandas.core.categorical import Factor
2626
from pandas.core.common import _asarray_tuplesafe, _try_sort
2727
from pandas.core.internals import BlockManager, make_block, form_blocks
@@ -507,7 +507,7 @@ def _write_to_group(self, key, value, table=False, append=False,
507507
wrapper(value)
508508
group._v_attrs.pandas_type = kind
509509
group._v_attrs.pandas_version = _version
510-
group._v_attrs.meta = getattr(value,'meta',None)
510+
#group._v_attrs.meta = getattr(value,'meta',None)
511511

512512
def _write_series(self, group, series):
513513
self._write_index(group, 'index', series.index)
@@ -848,10 +848,10 @@ def _read_group(self, group, where=None, **kwargs):
848848
kind = _LEGACY_MAP.get(kind, kind)
849849
handler = self._get_handler(op='read', kind=kind)
850850
v = handler(group, where, **kwargs)
851-
if v is not None:
852-
meta = getattr(group._v_attrs,'meta',None)
853-
if meta is not None:
854-
v.meta = meta
851+
#if v is not None:
852+
# meta = getattr(group._v_attrs,'meta',None)
853+
# if meta is not None:
854+
# v.meta = meta
855855
return v
856856

857857
def _read_series(self, group, where=None):
@@ -1001,16 +1001,22 @@ def validate_and_set(self, table, append, **kwargs):
10011001
self.validate_attr(append)
10021002
self.set_attr()
10031003

1004-
def validate_col(self):
1005-
""" validate this column & set table data for it """
1004+
def validate_col(self, itemsize = None):
1005+
""" validate this column: return the compared against itemsize """
10061006

10071007
# validate this column for string truncation (or reset to the max size)
1008-
if self.kind == 'string':
1008+
dtype = getattr(self,'dtype',None)
1009+
if self.kind == 'string' or (dtype is not None and dtype.startswith('string')):
10091010

10101011
c = self.col
10111012
if c is not None:
1012-
if c.itemsize < self.itemsize:
1013-
raise Exception("[%s] column has a min_itemsize of [%s] but itemsize [%s] is required!" % (self.cname,self.itemsize,c.itemsize))
1013+
if itemsize is None:
1014+
itemsize = self.itemsize
1015+
if c.itemsize < itemsize:
1016+
raise Exception("[%s] column has a min_itemsize of [%s] but itemsize [%s] is required!" % (self.cname,itemsize,c.itemsize))
1017+
return c.itemsize
1018+
1019+
return None
10141020

10151021

10161022
def validate_attr(self, append):
@@ -1404,26 +1410,35 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None):
14041410
# a string column
14051411
if b.dtype.name == 'object':
14061412

1413+
# itemsize is the maximum length of a string (along any dimension)
1414+
itemsize = _itemsize_string_array(values)
1415+
14071416
# specified min_itemsize?
14081417
if isinstance(min_itemsize, dict):
1409-
min_itemsize = int(min_itemsize.get('values'))
1418+
itemsize = max(int(min_itemsize.get('values')),itemsize)
1419+
1420+
# check for column in the values conflicts
1421+
if existing_table is not None and validate:
1422+
eci = existing_table.values_axes[i].validate_col(itemsize)
1423+
if eci > itemsize:
1424+
itemsize = eci
14101425

1411-
if min_itemsize is None:
1412-
min_itemsize = values.dtype.itemsize
1426+
atom = _tables().StringCol(itemsize = itemsize, shape = shape)
1427+
utype = 'S%s' % itemsize
1428+
kind = 'string'
14131429

1414-
atom = _tables().StringCol(itemsize = min_itemsize, shape = shape)
1415-
utype = 'S%s' % min_itemsize
14161430
else:
14171431
atom = getattr(_tables(),"%sCol" % b.dtype.name.capitalize())(shape = shape)
14181432
utype = atom._deftype
1433+
kind = b.dtype.name
14191434

14201435
# coerce data to this type
14211436
try:
14221437
values = values.astype(utype)
14231438
except (Exception), detail:
14241439
raise Exception("cannot coerce data type -> [dtype->%s]" % b.dtype.name)
14251440

1426-
dc = DataCol.create_for_block(i = i, values = list(b.items), kind = b.dtype.name, typ = atom, data = values, pos = j)
1441+
dc = DataCol.create_for_block(i = i, values = list(b.items), kind = kind, typ = atom, data = values, pos = j)
14271442
j += 1
14281443
self.values_axes.append(dc)
14291444

@@ -1663,7 +1678,6 @@ def write_data(self):
16631678
""" fast writing of data: requires specific cython routines each axis shape """
16641679

16651680
# create the masks & values
1666-
#import pdb; pdb.set_trace()
16671681
masks = []
16681682
for a in self.values_axes:
16691683

@@ -1694,7 +1708,6 @@ def write_data(self):
16941708
if len(rows):
16951709
self.table.append(rows)
16961710
except (Exception), detail:
1697-
#import pdb; pdb.set_trace()
16981711
raise Exception("tables cannot write this data -> %s" % str(detail))
16991712

17001713
def delete(self, where = None):
@@ -1849,6 +1862,10 @@ def create_table(parent, group, typ = None, **kwargs):
18491862
return _TABLE_MAP.get(tt)(parent, group, **kwargs)
18501863

18511864

1865+
def _itemsize_string_array(arr):
1866+
""" return the maximum size of elements in a strnig array """
1867+
return max([ str_len(arr[v]).max() for v in range(arr.shape[0]) ])
1868+
18521869
def _convert_index(index):
18531870
if isinstance(index, DatetimeIndex):
18541871
converted = index.asi8
@@ -2247,14 +2264,20 @@ def f(values, freq=None, tz=None):
22472264

22482265
def create_debug_memory(parent):
22492266
_debug_memory = getattr(parent,'_debug_memory',False)
2267+
def get_memory(s):
2268+
pass
2269+
22502270
if not _debug_memory:
2251-
def get_memory(s):
2252-
pass
2271+
pass
22532272
else:
2254-
import psutil, os
2255-
def get_memory(s):
2256-
p = psutil.Process(os.getpid())
2257-
(rss,vms) = p.get_memory_info()
2258-
mp = p.get_memory_percent()
2259-
print "[%s] cur_mem->%.2f (MB),per_mem->%.2f" % (s,rss/1000000.0,mp)
2273+
try:
2274+
import psutil, os
2275+
def get_memory(s):
2276+
p = psutil.Process(os.getpid())
2277+
(rss,vms) = p.get_memory_info()
2278+
mp = p.get_memory_percent()
2279+
print "[%s] cur_mem->%.2f (MB),per_mem->%.2f" % (s,rss/1000000.0,mp)
2280+
except:
2281+
pass
2282+
22602283
return get_memory

pandas/io/tests/test_pytables.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ def test_versioning(self):
9898
self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10')
9999

100100
def test_meta(self):
101+
raise nose.SkipTest('no meta')
102+
101103
meta = { 'foo' : [ 'I love pandas ' ] }
102104
s = tm.makeTimeSeries()
103105
s.meta = meta
@@ -167,6 +169,29 @@ def test_put(self):
167169
self.store.put('c', df[:10], table=True, append=False)
168170
tm.assert_frame_equal(df[:10], self.store['c'])
169171

172+
def test_put_string_index(self):
173+
174+
index = Index([ "I am a very long string index: %s" % i for i in range(20) ])
175+
s = Series(np.arange(20), index = index)
176+
df = DataFrame({ 'A' : s, 'B' : s })
177+
178+
self.store['a'] = s
179+
tm.assert_series_equal(self.store['a'], s)
180+
181+
self.store['b'] = df
182+
tm.assert_frame_equal(self.store['b'], df)
183+
184+
# mixed length
185+
index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] + [ "I am a very long string index: %s" % i for i in range(20) ])
186+
s = Series(np.arange(21), index = index)
187+
df = DataFrame({ 'A' : s, 'B' : s })
188+
self.store['a'] = s
189+
tm.assert_series_equal(self.store['a'], s)
190+
191+
self.store['b'] = df
192+
tm.assert_frame_equal(self.store['b'], df)
193+
194+
170195
def test_put_compression(self):
171196
df = tm.makeTimeDataFrame()
172197

@@ -325,11 +350,22 @@ def test_append_with_strings(self):
325350
self.store.append('df_big',df, min_itemsize = { 'values' : 1024 })
326351
tm.assert_frame_equal(self.store.select('df_big'), df)
327352

353+
# appending smaller string ok
354+
df2 = DataFrame([[124,'asdqy'], [346,'dggnhefbdfb']])
355+
self.store.append('df_big',df2)
356+
expected = concat([ df, df2 ])
357+
tm.assert_frame_equal(self.store.select('df_big'), expected)
358+
328359
# avoid truncation on elements
329360
df = DataFrame([[123,'asdqwerty'], [345,'dggnhebbsdfbdfb']])
330-
self.store.append('df_big2',df, min_itemsize = { 'values' : 300 })
361+
self.store.append('df_big2',df, min_itemsize = { 'values' : 10 })
331362
tm.assert_frame_equal(self.store.select('df_big2'), df)
332363

364+
# bigger string on next append
365+
self.store.append('df_new',df, min_itemsize = { 'values' : 16 })
366+
df_new = DataFrame([[124,'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']])
367+
self.assertRaises(Exception, self.store.append, 'df_new',df_new)
368+
333369
def test_create_table_index(self):
334370
wp = tm.makePanel()
335371
self.store.append('p5', wp)
@@ -375,7 +411,8 @@ def test_create_table_index(self):
375411

376412

377413
def test_big_table(self):
378-
414+
raise nose.SkipTest('no big table')
415+
379416
# create and write a big table
380417
wp = Panel(np.random.randn(20, 1000, 1000), items= [ 'Item%s' % i for i in xrange(20) ],
381418
major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%s' % i for i in xrange(1000) ])

0 commit comments

Comments
 (0)