
ENH: use size instead of cythonized count for fallback cases #7055

Merged: 1 commit, May 8, 2014
pandas/core/groupby.py: 3 changes (1 addition, 2 deletions)

@@ -722,8 +722,7 @@ def size(self):
last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
_convert=True)

_count = _groupby_function('_count', 'count',
lambda x, axis=0: notnull(x).sum(axis=axis),
_count = _groupby_function('_count', 'count', lambda x, axis=0: x.size(),
numeric_only=False)

def count(self, axis=0):
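
For context: the removed fallback counted non-null entries with notnull(x).sum(axis=axis), while the replacement just takes the group's size. The idea appears to be that, once the dtype-specific cython kernels below are gone, this fallback is only reached for dtypes such as int8, int16 or uint32, and integer columns cannot hold NaN, so size and non-null count coincide there. A quick sketch of the distinction (data and column names are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({'grp': list('aabb'),
                   'small': np.array([1, 2, 3, 4], dtype=np.int8),  # cannot hold NaN
                   'vals': [1.0, np.nan, 3.0, 4.0]})

g = df.groupby('grp')
print(g['small'].count())  # a -> 2, b -> 2
print(g['small'].size())   # identical: integer columns have no missing values
print(g['vals'].count())   # a -> 1, b -> 2, the NaN is excluded
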
pandas/src/generate_code.py: 27 changes (17 additions, 10 deletions)

@@ -2219,18 +2219,21 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values,
#-------------------------------------------------------------------------
# Generators

def generate_put_template(template, use_ints = True, use_floats = True,
use_objects=False):
def generate_put_template(template, use_ints=True, use_floats=True,
use_objects=False, use_datelikes=False):
floats_list = [
('float64', 'float64_t', 'float64_t', 'np.float64'),
('float32', 'float32_t', 'float32_t', 'np.float32'),
]
]
ints_list = [
('int8', 'int8_t', 'float32_t', 'np.float32'),
('int16', 'int16_t', 'float32_t', 'np.float32'),
('int32', 'int32_t', 'float64_t', 'np.float64'),
('int64', 'int64_t', 'float64_t', 'np.float64'),
]
]
date_like_list = [
('int64', 'int64_t', 'float64_t', 'np.float64'),
]
object_list = [('object', 'object', 'float64_t', 'np.float64')]
function_list = []
if use_floats:
@@ -2239,14 +2242,16 @@ def generate_put_template(template, use_ints = True, use_floats = True,
function_list.extend(ints_list)
if use_objects:
function_list.extend(object_list)
if use_datelikes:
function_list.extend(date_like_list)

output = StringIO()
for name, c_type, dest_type, dest_dtype in function_list:
func = template % {'name' : name,
'c_type' : c_type,
'dest_type' : dest_type.replace('_t', ''),
'dest_type2' : dest_type,
'dest_dtype' : dest_dtype}
func = template % {'name': name,
'c_type': c_type,
'dest_type': dest_type.replace('_t', ''),
'dest_type2': dest_type,
'dest_dtype': dest_dtype}
output.write(func)
return output.getvalue()

@@ -2372,7 +2377,9 @@ def generate_take_cython_file(path='generated.pyx'):
print(generate_put_template(template, use_ints=False), file=f)

for template in groupby_count:
print(generate_put_template(template, use_objects=True), file=f)
print(generate_put_template(template, use_ints=False,
use_datelikes=True, use_objects=True),
file=f)

# for template in templates_1d_datetime:
# print >> f, generate_from_template_datetime(template)
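
For reference, generate_code.py produces generated.pyx by interpolating each (name, c_type, dest_type, dest_dtype) tuple into a %-style template and concatenating the results, exactly as in the loop above. A stripped-down sketch of that mechanism, with an invented template body standing in for the real groupby_count templates:

from io import StringIO

# Hypothetical template; the real ones are full Cython function bodies.
template = '''
def group_demo_%(name)s(values):
    # accepts %(c_type)s input, accumulates into %(dest_type2)s
    pass
'''

function_list = [
    ('float64', 'float64_t', 'float64_t', 'np.float64'),
    ('object', 'object', 'float64_t', 'np.float64'),
]

output = StringIO()
for name, c_type, dest_type, dest_dtype in function_list:
    output.write(template % {'name': name,
                             'c_type': c_type,
                             'dest_type': dest_type.replace('_t', ''),
                             'dest_type2': dest_type,
                             'dest_dtype': dest_dtype})
print(output.getvalue())
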
pandas/src/generated.pyx: 222 changes (6 additions, 216 deletions)

@@ -6697,89 +6697,17 @@ def group_count_float32(ndarray[float32_t, ndim=2] out,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_int8(ndarray[float32_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[int8_t, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab
Py_ssize_t N = values.shape[0], K = values.shape[1]
int8_t val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

if len(values) != len(labels):
raise AssertionError("len(index) != len(labels)")

for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[lab, j] += val == val and val != iNaT

for i in range(len(counts)):
for j in range(K):
out[i, j] = nobs[i, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_int16(ndarray[float32_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[int16_t, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab
Py_ssize_t N = values.shape[0], K = values.shape[1]
int16_t val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

if len(values) != len(labels):
raise AssertionError("len(index) != len(labels)")

for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[lab, j] += val == val and val != iNaT

for i in range(len(counts)):
for j in range(K):
out[i, j] = nobs[i, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_int32(ndarray[float64_t, ndim=2] out,
def group_count_object(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[int32_t, ndim=2] values,
ndarray[object, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab
Py_ssize_t N = values.shape[0], K = values.shape[1]
int32_t val
object val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

@@ -6839,42 +6767,6 @@ def group_count_int64(ndarray[float64_t, ndim=2] out,
out[i, j] = nobs[i, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_object(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[object, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab
Py_ssize_t N = values.shape[0], K = values.shape[1]
object val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

if len(values) != len(labels):
raise AssertionError("len(index) != len(labels)")

for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[lab, j] += val == val and val != iNaT

for i in range(len(counts)):
for j in range(K):
out[i, j] = nobs[i, j]



@cython.boundscheck(False)
@cython.wraparound(False)
@@ -6946,85 +6838,17 @@ def group_count_bin_float32(ndarray[float32_t, ndim=2] out,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_bin_int8(ndarray[float32_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[int8_t, ndim=2] values,
ndarray[int64_t] bins):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, ngroups
Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
int8_t val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

ngroups = len(bins) + (bins[len(bins) - 1] != N)

for i in range(N):
while b < ngroups - 1 and i >= bins[b]:
b += 1

counts[b] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[b, j] += val == val and val != iNaT

for i in range(ngroups):
for j in range(K):
out[i, j] = nobs[i, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_bin_int16(ndarray[float32_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[int16_t, ndim=2] values,
ndarray[int64_t] bins):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, ngroups
Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
int16_t val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

ngroups = len(bins) + (bins[len(bins) - 1] != N)

for i in range(N):
while b < ngroups - 1 and i >= bins[b]:
b += 1

counts[b] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[b, j] += val == val and val != iNaT

for i in range(ngroups):
for j in range(K):
out[i, j] = nobs[i, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_bin_int32(ndarray[float64_t, ndim=2] out,
def group_count_bin_object(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[int32_t, ndim=2] values,
ndarray[object, ndim=2] values,
ndarray[int64_t] bins):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, ngroups
Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
int32_t val
object val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

@@ -7080,40 +6904,6 @@ def group_count_bin_int64(ndarray[float64_t, ndim=2] out,
out[i, j] = nobs[i, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_bin_object(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[object, ndim=2] values,
ndarray[int64_t] bins):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, ngroups
Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
object val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

ngroups = len(bins) + (bins[len(bins) - 1] != N)

for i in range(N):
while b < ngroups - 1 and i >= bins[b]:
b += 1

counts[b] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[b, j] += val == val and val != iNaT

for i in range(ngroups):
for j in range(K):
out[i, j] = nobs[i, j]



@cython.wraparound(False)
@cython.boundscheck(False)
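
Every kernel in this file follows the same shape: walk the rows, skip rows whose label is negative (no group), and bump a per-(group, column) counter whenever the value passes val == val and val != iNaT, since NaN is the only value that fails self-equality and iNaT is pandas' int64 sentinel for missing datetimes. A pure-Python sketch of that logic, assuming iNaT equals np.iinfo(np.int64).min and ignoring the per-dtype specialization:

import numpy as np

iNaT = np.iinfo(np.int64).min  # assumed value of the missing-datetime sentinel

def group_count(values, labels, ngroups):
    # Mirrors the generated kernels: nobs counts non-missing values per
    # (group, column); counts tracks total rows per group.
    N, K = values.shape
    nobs = np.zeros((ngroups, K), dtype=np.int64)
    counts = np.zeros(ngroups, dtype=np.int64)
    for i in range(N):
        lab = labels[i]
        if lab < 0:  # label -1 marks rows that belong to no group
            continue
        counts[lab] += 1
        for j in range(K):
            val = values[i, j]
            # not nan: NaN fails val == val
            nobs[lab, j] += (val == val and val != iNaT)
    return nobs, counts

vals = np.array([[1.0, np.nan], [2.0, 3.0], [np.nan, 4.0]])
labels = np.array([0, 0, 1])
print(group_count(vals, labels, ngroups=2))  # nobs [[2, 1], [0, 1]], counts [2, 1]
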
pandas/tests/test_groupby.py: 13 changes (13 additions, 0 deletions)

@@ -4202,6 +4202,19 @@ def test_datetime_count(self):
name='dates')
tm.assert_series_equal(result, expected)

def test_lower_int_prec_count(self):
df = DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
'b': np.array([1, 2, 3, 6], np.uint32),
'c': np.array([4, 5, 6, 8], np.int16),
'grp': list('ab' * 2)})
result = df.groupby('grp').count()
expected = DataFrame({'a': [2, 2],
'b': [2, 2],
'c': [2, 2]}, index=pd.Index(list('ab'),
name='grp'))
tm.assert_frame_equal(result, expected)


def assert_fp_equal(a, b):
assert (np.abs(a - b) < 1e-12).all()

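This test pins down exactly the dtypes whose dedicated kernels were deleted above (int8, int16, int32; uint32 never had one), which appear to reach count through the generic fallback now. Since integer columns cannot hold missing values, every entry is counted. The same check at an interactive prompt, assuming this branch of pandas:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
                   'grp': list('ab' * 2)})
print(df.groupby('grp').count())  # 2 rows in each of groups 'a' and 'b'
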
vb_suite/groupby.py: 16 changes (15 additions, 1 deletion)

@@ -133,21 +133,35 @@ def f():
value2 = np.random.randn(n)
value2[np.random.rand(n) > 0.5] = np.nan

obj = pd.util.testing.choice(['a', 'b'], size=n).astype(object)
obj = tm.choice(list('ab'), size=n).astype(object)
obj[np.random.randn(n) > 0.5] = np.nan

df = DataFrame({'key1': np.random.randint(0, 500, size=n),
'key2': np.random.randint(0, 100, size=n),
'dates': dates,
'value2' : value2,
'value3' : np.random.randn(n),
'ints': np.random.randint(0, 1000, size=n),
'obj': obj,
'offsets': offsets})
"""

groupby_multi_count = Benchmark("df.groupby(['key1', 'key2']).count()",
setup, name='groupby_multi_count',
start_date=datetime(2014, 5, 5))

setup = common_setup + """
n = 10000

df = DataFrame({'key1': randint(0, 500, size=n),
'key2': randint(0, 100, size=n),
'ints': randint(0, 1000, size=n),
'ints2': randint(0, 1000, size=n)})
"""

groupby_int_count = Benchmark("df.groupby(['key1', 'key2']).count()",
setup, name='groupby_int_count',
start_date=datetime(2014, 5, 6))
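
A vbench Benchmark pairs a setup string, executed once, with a statement that is timed across revisions. Outside the vb_suite harness, roughly the same measurement can be taken with timeit (a sketch; the repeat count of 100 is arbitrary):

import timeit

setup = '''
from numpy.random import randint
from pandas import DataFrame
n = 10000
df = DataFrame({'key1': randint(0, 500, size=n),
                'key2': randint(0, 100, size=n),
                'ints': randint(0, 1000, size=n),
                'ints2': randint(0, 1000, size=n)})
'''
elapsed = timeit.timeit("df.groupby(['key1', 'key2']).count()",
                        setup=setup, number=100)
print('%.3f ms per groupby count' % (elapsed / 100 * 1000))
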
#----------------------------------------------------------------------
# Series.value_counts
