From cf6cbb2c113fe24a681f0ef423bfb3d143d42274 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 5 May 2014 22:59:09 -0400 Subject: [PATCH] ENH: use size instead of cythonized count for fallback cases --- pandas/core/groupby.py | 3 +- pandas/src/generate_code.py | 27 +++-- pandas/src/generated.pyx | 222 +---------------------------------- pandas/tests/test_groupby.py | 13 ++ vb_suite/groupby.py | 16 ++- 5 files changed, 52 insertions(+), 229 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 400f7e06df784..2c7f6c5e181da 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -722,8 +722,7 @@ def size(self): last = _groupby_function('last', 'last', _last_compat, numeric_only=False, _convert=True) - _count = _groupby_function('_count', 'count', - lambda x, axis=0: notnull(x).sum(axis=axis), + _count = _groupby_function('_count', 'count', lambda x, axis=0: x.size(), numeric_only=False) def count(self, axis=0): diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 53754a899adf8..b432ddd03d17f 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -2219,18 +2219,21 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, #------------------------------------------------------------------------- # Generators -def generate_put_template(template, use_ints = True, use_floats = True, - use_objects=False): +def generate_put_template(template, use_ints=True, use_floats=True, + use_objects=False, use_datelikes=False): floats_list = [ ('float64', 'float64_t', 'float64_t', 'np.float64'), ('float32', 'float32_t', 'float32_t', 'np.float32'), - ] + ] ints_list = [ ('int8', 'int8_t', 'float32_t', 'np.float32'), ('int16', 'int16_t', 'float32_t', 'np.float32'), ('int32', 'int32_t', 'float64_t', 'np.float64'), ('int64', 'int64_t', 'float64_t', 'np.float64'), - ] + ] + date_like_list = [ + ('int64', 'int64_t', 'float64_t', 'np.float64'), + ] object_list = [('object', 'object', 'float64_t', 'np.float64')] function_list = [] if use_floats: @@ -2239,14 +2242,16 @@ def generate_put_template(template, use_ints = True, use_floats = True, function_list.extend(ints_list) if use_objects: function_list.extend(object_list) + if use_datelikes: + function_list.extend(date_like_list) output = StringIO() for name, c_type, dest_type, dest_dtype in function_list: - func = template % {'name' : name, - 'c_type' : c_type, - 'dest_type' : dest_type.replace('_t', ''), - 'dest_type2' : dest_type, - 'dest_dtype' : dest_dtype} + func = template % {'name': name, + 'c_type': c_type, + 'dest_type': dest_type.replace('_t', ''), + 'dest_type2': dest_type, + 'dest_dtype': dest_dtype} output.write(func) return output.getvalue() @@ -2372,7 +2377,9 @@ def generate_take_cython_file(path='generated.pyx'): print(generate_put_template(template, use_ints=False), file=f) for template in groupby_count: - print(generate_put_template(template, use_objects=True), file=f) + print(generate_put_template(template, use_ints=False, + use_datelikes=True, use_objects=True), + file=f) # for template in templates_1d_datetime: # print >> f, generate_from_template_datetime(template) diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 26c6f3daf0e0a..42ae043847ba1 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -6697,81 +6697,9 @@ def group_count_float32(ndarray[float32_t, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_count_int8(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int8_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, lab - Py_ssize_t N = values.shape[0], K = values.shape[1] - int8_t val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[lab, j] += val == val and val != iNaT - - for i in range(len(counts)): - for j in range(K): - out[i, j] = nobs[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_int16(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int16_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, lab - Py_ssize_t N = values.shape[0], K = values.shape[1] - int16_t val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[lab, j] += val == val and val != iNaT - - for i in range(len(counts)): - for j in range(K): - out[i, j] = nobs[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_int32(ndarray[float64_t, ndim=2] out, +def group_count_object(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, - ndarray[int32_t, ndim=2] values, + ndarray[object, ndim=2] values, ndarray[int64_t] labels): ''' Only aggregates on axis=0 @@ -6779,7 +6707,7 @@ def group_count_int32(ndarray[float64_t, ndim=2] out, cdef: Py_ssize_t i, j, lab Py_ssize_t N = values.shape[0], K = values.shape[1] - int32_t val + object val ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), dtype=np.int64) @@ -6839,42 +6767,6 @@ def group_count_int64(ndarray[float64_t, ndim=2] out, out[i, j] = nobs[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_object(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, lab - Py_ssize_t N = values.shape[0], K = values.shape[1] - object val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[lab, j] += val == val and val != iNaT - - for i in range(len(counts)): - for j in range(K): - out[i, j] = nobs[i, j] - - @cython.boundscheck(False) @cython.wraparound(False) @@ -6946,77 +6838,9 @@ def group_count_bin_float32(ndarray[float32_t, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_count_bin_int8(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int8_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, ngroups - Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 - int8_t val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - ngroups = len(bins) + (bins[len(bins) - 1] != N) - - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[b, j] += val == val and val != iNaT - - for i in range(ngroups): - for j in range(K): - out[i, j] = nobs[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_bin_int16(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int16_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, ngroups - Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 - int16_t val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - ngroups = len(bins) + (bins[len(bins) - 1] != N) - - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[b, j] += val == val and val != iNaT - - for i in range(ngroups): - for j in range(K): - out[i, j] = nobs[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_bin_int32(ndarray[float64_t, ndim=2] out, +def group_count_bin_object(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, - ndarray[int32_t, ndim=2] values, + ndarray[object, ndim=2] values, ndarray[int64_t] bins): ''' Only aggregates on axis=0 @@ -7024,7 +6848,7 @@ def group_count_bin_int32(ndarray[float64_t, ndim=2] out, cdef: Py_ssize_t i, j, ngroups Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 - int32_t val + object val ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), dtype=np.int64) @@ -7080,40 +6904,6 @@ def group_count_bin_int64(ndarray[float64_t, ndim=2] out, out[i, j] = nobs[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_bin_object(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, ngroups - Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 - object val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - ngroups = len(bins) + (bins[len(bins) - 1] != N) - - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[b, j] += val == val and val != iNaT - - for i in range(ngroups): - for j in range(K): - out[i, j] = nobs[i, j] - - @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index eb3c28b672fd4..107bc46da49fa 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4202,6 +4202,19 @@ def test_datetime_count(self): name='dates') tm.assert_series_equal(result, expected) + def test_lower_int_prec_count(self): + df = DataFrame({'a': np.array([0, 1, 2, 100], np.int8), + 'b': np.array([1, 2, 3, 6], np.uint32), + 'c': np.array([4, 5, 6, 8], np.int16), + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2], + 'b': [2, 2], + 'c': [2, 2]}, index=pd.Index(list('ab'), + name='grp')) + tm.assert_frame_equal(result, expected) + + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index 638862ffd1367..6f2132ff9b154 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -133,7 +133,7 @@ def f(): value2 = np.random.randn(n) value2[np.random.rand(n) > 0.5] = np.nan -obj = pd.util.testing.choice(['a', 'b'], size=n).astype(object) +obj = tm.choice(list('ab'), size=n).astype(object) obj[np.random.randn(n) > 0.5] = np.nan df = DataFrame({'key1': np.random.randint(0, 500, size=n), @@ -141,6 +141,7 @@ def f(): 'dates': dates, 'value2' : value2, 'value3' : np.random.randn(n), + 'ints': np.random.randint(0, 1000, size=n), 'obj': obj, 'offsets': offsets}) """ @@ -148,6 +149,19 @@ def f(): groupby_multi_count = Benchmark("df.groupby(['key1', 'key2']).count()", setup, name='groupby_multi_count', start_date=datetime(2014, 5, 5)) + +setup = common_setup + """ +n = 10000 + +df = DataFrame({'key1': randint(0, 500, size=n), + 'key2': randint(0, 100, size=n), + 'ints': randint(0, 1000, size=n), + 'ints2': randint(0, 1000, size=n)}) +""" + +groupby_int_count = Benchmark("df.groupby(['key1', 'key2']).count()", + setup, name='groupby_int_count', + start_date=datetime(2014, 5, 6)) #---------------------------------------------------------------------- # Series.value_counts