From 1ac9d2bfe5783840351e2e184545f61dfc547418 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Tue, 28 Jan 2014 21:26:48 -0500 Subject: [PATCH] BUG: allow lex string comparisons --- doc/source/enhancingperf.rst | 34 ++++++++++++++++++++++++++-------- doc/source/release.rst | 2 ++ pandas/computation/expr.py | 3 ++- pandas/tests/test_frame.py | 19 +++++++++++++++++++ 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index ddf1fa62b2d61..066fcce64c5ac 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -464,19 +464,20 @@ evaluate an expression in the "context" of a ``DataFrame``. Any expression that is a valid :func:`~pandas.eval` expression is also a valid ``DataFrame.eval`` expression, with the added benefit that *you don't have to -prefix the name of the* ``DataFrame`` *to the column you're interested in +prefix the name of the* ``DataFrame`` *to the column(s) you're interested in evaluating*. -In addition, you can perform in-line assignment of columns within an expression. -This can allow for *formulaic evaluation*. Only a signle assignement is permitted. -It can be a new column name or an existing column name. It must be a string-like. +In addition, you can perform assignment of columns within an expression. +This allows for *formulaic evaluation*. Only a single assignment is permitted. +The assignment target can be a new column name or an existing column name, and +it must be a valid Python identifier. .. ipython:: python - df = DataFrame(dict(a = range(5), b = range(5,10))) - df.eval('c=a+b') - df.eval('d=a+b+c') - df.eval('a=1') + df = DataFrame(dict(a=range(5), b=range(5, 10))) + df.eval('c = a + b') + df.eval('d = a + b + c') + df.eval('a = 1') df Local Variables @@ -616,3 +617,20 @@ different engines. This plot was created using a ``DataFrame`` with 3 columns each containing floating point values generated using ``numpy.random.randn()``. 
+ +Technical Minutia +~~~~~~~~~~~~~~~~~ +- Expressions that would result in an object dtype (including simple + variable evaluation) have to be evaluated in Python space. The main reason + for this behavior is to maintain backwards compatibility with versions of + numpy < 1.7. In those versions of ``numpy`` a call to ``ndarray.astype(str)`` + will truncate any strings that are more than 60 characters in length. Second, + we can't pass ``object`` arrays to ``numexpr`` thus string comparisons must + be evaluated in Python space. +- The upshot is that this *only* applies to object-dtype'd expressions. So, + if you have an expression--for example--that's a string comparison + ``and``-ed together with another boolean expression that's from a numeric + comparison, the numeric comparison will be evaluated by ``numexpr``. In fact, + in general, :meth:`~pandas.DataFrame.query`/:func:`~pandas.eval` will "pick out" the + subexpressions that are ``eval``-able by ``numexpr`` and those that must be + evaluated in Python space transparently to the user. diff --git a/doc/source/release.rst b/doc/source/release.rst index 2caad982f044b..a79182af13955 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -168,6 +168,8 @@ Bug Fixes - Bug in DataFrame construction with recarray and non-ns datetime dtype (:issue:`6140`) - Bug in ``.loc`` setitem indexing with a datafrme on rhs, multiple item setting, and a datetimelike (:issue:`6152`) + - Fixed a stack overflow bug in ``query``/``eval`` during lexicographic + string comparisons (:issue:`6155`). 
pandas 0.13.0 ------------- diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index c16205ff34b1f..7ebffef7368a0 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -508,7 +508,8 @@ def _possibly_eval(self, binop, eval_in_python): def _possibly_evaluate_binop(self, op, op_class, lhs, rhs, eval_in_python=('in', 'not in'), - maybe_eval_in_python=('==', '!=')): + maybe_eval_in_python=('==', '!=', '<', '>', + '<=', '>=')): res = op(lhs, rhs) if self.engine != 'pytables': diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 50c5cca935553..4578375ab7dad 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12841,6 +12841,25 @@ def test_query_with_nested_string(self): for parser, engine in product(PARSERS, ENGINES): yield self.check_query_with_nested_strings, parser, engine + def check_query_lex_compare_strings(self, parser, engine): + tm.skip_if_no_ne(engine=engine) + import operator as opr + + a = Series(tm.choice(list('abcde'), 20)) + b = Series(np.arange(a.size)) + df = DataFrame({'X': a, 'Y': b}) + + ops = {'<': opr.lt, '>': opr.gt, '<=': opr.le, '>=': opr.ge} + + for op, func in ops.items(): + res = df.query('X %s "d"' % op, engine=engine, parser=parser) + expected = df[func(df.X, 'd')] + assert_frame_equal(res, expected) + + def test_query_lex_compare_strings(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_query_lex_compare_strings, parser, engine + class TestDataFrameEvalNumExprPandas(tm.TestCase): @classmethod