diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 0223a11d8a011..a367147203e3f 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -463,7 +463,7 @@ Bug Fixes - Bug in pickle deserialization that failed for pre-0.14.1 containers with dup items trying to avoid ambiguity when matching block and manager items, when there's only one block there's no ambiguity (:issue:`7794`) - +- Bug in HDFStore iteration when passing a where (:issue:`8014`) - Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5150729ed6f79..07e9abeaadbb4 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -662,21 +662,18 @@ def select(self, key, where=None, start=None, stop=None, columns=None, s = self._create_storer(group) s.infer_axes() - # what we are actually going to do for a chunk - def func(_start, _stop): - return s.read(where=where, start=_start, stop=_stop, + # function to call on iteration + def func(_start, _stop, _where): + return s.read(start=_start, stop=_stop, + where=_where, columns=columns, **kwargs) - if iterator or chunksize is not None: - if not s.is_table: - raise TypeError( - "can only use an iterator or chunksize on a table") - return TableIterator(self, func, nrows=s.nrows, start=start, - stop=stop, chunksize=chunksize, - auto_close=auto_close) + # create the iterator + it = TableIterator(self, s, func, where=where, nrows=s.nrows, start=start, + stop=stop, iterator=iterator, chunksize=chunksize, + auto_close=auto_close) - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, - auto_close=auto_close).get_values() + return it.get_result() def select_as_coordinates( self, key, where=None, start=None, stop=None, **kwargs): @@ -779,26 +776,22 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, # axis is the concentation axes axis = list(set([t.non_index_axes[0][0] for t 
in tbls]))[0] - def func(_start, _stop): - if where is not None: - c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs) - else: - c = None + def func(_start, _stop, _where): - objs = [t.read(where=c, start=_start, stop=_stop, - columns=columns, **kwargs) for t in tbls] + # retrieve the objs, _where is always passed as a set of coordinates here + objs = [t.read(where=_where, columns=columns, **kwargs) for t in tbls] # concat and return return concat(objs, axis=axis, verify_integrity=False).consolidate() - if iterator or chunksize is not None: - return TableIterator(self, func, nrows=nrows, start=start, - stop=stop, chunksize=chunksize, - auto_close=auto_close) + # create the iterator + it = TableIterator(self, s, func, where=where, nrows=nrows, start=start, + stop=stop, iterator=iterator, chunksize=chunksize, + auto_close=auto_close) + + return it.get_result(coordinates=True) - return TableIterator(self, func, nrows=nrows, start=start, stop=stop, - auto_close=auto_close).get_values() def put(self, key, value, format=None, append=False, **kwargs): """ @@ -1293,20 +1286,25 @@ class TableIterator(object): ---------- store : the reference store - func : the function to get results + s : the referred storer + func : the function to execute the query + where : the where of the query nrows : the rows to iterate on start : the passed start value (default is None) stop : the passed stop value (default is None) - chunksize : the passed chunking valeu (default is 50000) + iterator : boolean, whether to use the default iterator + chunksize : the passed chunking value (default is 100000) auto_close : boolean, automatically close the store at the end of iteration, default is False kwargs : the passed kwargs """ - def __init__(self, store, func, nrows, start=None, stop=None, - chunksize=None, auto_close=False): + def __init__(self, store, s, func, where, nrows, start=None, stop=None, + iterator=False, chunksize=None, auto_close=False): self.store = store - 
self.func = func + self.s = s + self.func = func + self.where = where self.nrows = nrows or 0 self.start = start or 0 @@ -1314,23 +1312,29 @@ def __init__(self, store, func, nrows, start=None, stop=None, stop = self.nrows self.stop = min(self.nrows, stop) - if chunksize is None: - chunksize = 100000 + self.coordinates = None + if iterator or chunksize is not None: + if chunksize is None: + chunksize = 100000 + self.chunksize = int(chunksize) + else: + self.chunksize = None - self.chunksize = chunksize self.auto_close = auto_close def __iter__(self): + + # iterate current = self.start while current < self.stop: - stop = current + self.chunksize - v = self.func(current, stop) - current = stop - if v is None: + stop = min(current + self.chunksize, self.stop) + value = self.func(None, None, self.coordinates[current:stop]) + current = stop + if value is None or not len(value): continue - yield v + yield value self.close() @@ -1338,12 +1342,29 @@ def close(self): if self.auto_close: self.store.close() - def get_values(self): - results = self.func(self.start, self.stop) + def get_result(self, coordinates=False): + + # return the actual iterator + if self.chunksize is not None: + if not self.s.is_table: + raise TypeError( + "can only use an iterator or chunksize on a table") + + self.coordinates = self.s.read_coordinates(where=self.where) + + return self + + # if specified read via coordinates (necessary for multiple selections) + if coordinates: + where = self.s.read_coordinates(where=self.where) + else: + where = self.where + + # directly return the result + results = self.func(self.start, self.stop, where) self.close() return results - class IndexCol(StringMixin): """ an index column description class diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 9cdecd16755c7..c1419ef2d023e 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -3264,21 +3264,16 @@ def test_select_iterator(self): expected = 
store.select('df') - results = [] - for s in store.select('df',iterator=True): - results.append(s) + results = [ s for s in store.select('df',iterator=True) ] result = concat(results) tm.assert_frame_equal(expected, result) - results = [] - for s in store.select('df',chunksize=100): - results.append(s) + + results = [ s for s in store.select('df',chunksize=100) ] self.assertEqual(len(results), 5) result = concat(results) tm.assert_frame_equal(expected, result) - results = [] - for s in store.select('df',chunksize=150): - results.append(s) + results = [ s for s in store.select('df',chunksize=150) ] result = concat(results) tm.assert_frame_equal(result, expected) @@ -3294,12 +3289,10 @@ def test_select_iterator(self): df = tm.makeTimeDataFrame(500) df.to_hdf(path,'df',format='table') - results = [] - for x in read_hdf(path,'df',chunksize=100): - results.append(x) + results = [ s for s in read_hdf(path,'df',chunksize=100) ] + result = concat(results) self.assertEqual(len(results), 5) - result = concat(results) tm.assert_frame_equal(result, df) tm.assert_frame_equal(result, read_hdf(path,'df')) @@ -3318,10 +3311,8 @@ def test_select_iterator(self): # full selection expected = store.select_as_multiple( ['df1', 'df2'], selector='df1') - results = [] - for s in store.select_as_multiple( - ['df1', 'df2'], selector='df1', chunksize=150): - results.append(s) + results = [ s for s in store.select_as_multiple( + ['df1', 'df2'], selector='df1', chunksize=150) ] result = concat(results) tm.assert_frame_equal(expected, result) @@ -3335,6 +3326,185 @@ def test_select_iterator(self): #result = concat(results) #tm.assert_frame_equal(expected, result) + def test_select_iterator_complete_8014(self): + + # GH 8014 + # using iterator and where clause + chunksize=1e4 + + # no iterator + with ensure_clean_store(self.path) as store: + + expected = tm.makeTimeDataFrame(100064, 'S') + _maybe_remove(store, 'df') + store.append('df',expected) + + beg_dt = expected.index[0] + end_dt = 
expected.index[-1] + + # select w/o iteration and no where clause works + result = store.select('df') + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, begin + # of range, works + where = "index >= '%s'" % beg_dt + result = store.select('df',where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, end + # of range, works + where = "index <= '%s'" % end_dt + result = store.select('df',where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, inclusive range, + # works + where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) + result = store.select('df',where=where) + tm.assert_frame_equal(expected, result) + + # with iterator, full range + with ensure_clean_store(self.path) as store: + + expected = tm.makeTimeDataFrame(100064, 'S') + _maybe_remove(store, 'df') + store.append('df',expected) + + beg_dt = expected.index[0] + end_dt = expected.index[-1] + + # select w/iterator and no where clause works + results = [ s for s in store.select('df',chunksize=chunksize) ] + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, begin of range + where = "index >= '%s'" % beg_dt + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, end of range + where = "index <= '%s'" % end_dt + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, inclusive range + where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + result = concat(results) + tm.assert_frame_equal(expected, result) + + def 
test_select_iterator_non_complete_8014(self): + + # GH 8014 + # using iterator and where clause + chunksize=1e4 + + # with iterator, non complete range + with ensure_clean_store(self.path) as store: + + expected = tm.makeTimeDataFrame(100064, 'S') + _maybe_remove(store, 'df') + store.append('df',expected) + + beg_dt = expected.index[1] + end_dt = expected.index[-2] + + # select w/iterator and where clause, single term, begin of range + where = "index >= '%s'" % beg_dt + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = "index <= '%s'" % end_dt + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + result = concat(results) + rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] + tm.assert_frame_equal(rexpected, result) + + # with iterator, empty where + with ensure_clean_store(self.path) as store: + + expected = tm.makeTimeDataFrame(100064, 'S') + _maybe_remove(store, 'df') + store.append('df',expected) + + end_dt = expected.index[-1] + + # select w/iterator and where clause, single term, begin of range + where = "index > '%s'" % end_dt + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + self.assertEqual(0, len(results)) + + def test_select_iterator_many_empty_frames(self): + + # GH 8014 + # using iterator and where clause can return many empty + # frames. 
+ chunksize=int(1e4) + + # with iterator, range limited to the first chunk + with ensure_clean_store(self.path) as store: + + expected = tm.makeTimeDataFrame(100000, 'S') + _maybe_remove(store, 'df') + store.append('df',expected) + + beg_dt = expected.index[0] + end_dt = expected.index[chunksize-1] + + # select w/iterator and where clause, single term, begin of range + where = "index >= '%s'" % beg_dt + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = "index <= '%s'" % end_dt + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + + tm.assert_equal(1, len(results)) + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + + # should be 1, is 10 + tm.assert_equal(1, len(results)) + result = concat(results) + rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause which selects + # *nothing*. + # + # To be consistent with Python idiom I suggest this should + # return [] e.g. `for e in []: print True` never prints + # True. + + where = "index <= '%s' & index >= '%s'" % (beg_dt, end_dt) + results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + + # should be [] + tm.assert_equal(0, len(results)) + + def test_retain_index_attributes(self): # GH 3499, losing frequency info on index recreation