From 7d789249de675505c88216bf86026b6a4a3fdc4c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 2 Nov 2022 14:39:02 +0000 Subject: [PATCH 1/3] use default_index more --- pandas/core/frame.py | 2 +- pandas/core/groupby/generic.py | 5 +++-- pandas/core/internals/construction.py | 2 +- pandas/core/reshape/encoding.py | 7 +++++-- pandas/core/reshape/merge.py | 3 ++- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 044c40c58b85c..75f51051761f8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4012,7 +4012,7 @@ def igetitem(obj, i: int): # Using self.iloc[:, i] = ... may set values inplace, which # by convention we do not do in __setitem__ try: - self.columns = Index(range(len(self.columns))) + self.columns = default_index(len(self.columns)) for i, iloc in enumerate(ilocs): self[iloc] = igetitem(value, i) finally: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cea9aaf70ccd0..e4e20ef98224c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -95,6 +95,7 @@ Index, MultiIndex, all_indexes_same, + default_index, ) from pandas.core.indexes.category import CategoricalIndex from pandas.core.series import Series @@ -1159,7 +1160,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if not self.as_index: self._insert_inaxis_grouper_inplace(result) - result.index = Index(range(len(result))) + result.index = default_index(len(result)) return result @@ -1778,7 +1779,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: ) if not self.as_index: - results.index = Index(range(len(results))) + results.index = default_index(len(results)) self._insert_inaxis_grouper_inplace(results) return results diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 28eab57ac7bde..356621a47c5d5 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -311,7 +311,7 @@ def ndarray_to_mgr( values = [values] if columns is None: - columns = Index(range(len(values))) + columns = default_index(len(values)) else: columns = ensure_index(columns) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index a39e3c1f10956..ec077caeef69e 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -21,7 +21,10 @@ from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.frame import DataFrame -from pandas.core.indexes.api import Index +from pandas.core.indexes.api import ( + Index, + default_index, +) from pandas.core.series import Series @@ -249,7 +252,7 @@ def get_empty_frame(data) -> DataFrame: if isinstance(data, Series): index = data.index else: - index = Index(range(len(data))) + index = default_index(len(data)) return DataFrame(index=index) # if all NaN diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f4332f2c7eb1b..2de399d8648b8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -84,6 +84,7 @@ import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc +from pandas.core.indexes.api import default_index from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: @@ -1060,7 +1061,7 @@ def _get_join_info( else: join_index = self.left.index.take(left_indexer) else: - join_index = Index(np.arange(len(left_indexer))) + join_index = default_index(len(left_indexer)) if len(join_index) == 0: join_index = join_index.astype(object) From c944f27ceb023451459188667e73f73015127aab Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 2 Nov 2022 15:13:00 +0000 Subject: [PATCH 2/3] dont change for columns --- pandas/core/frame.py | 2 +- pandas/core/internals/construction.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 75f51051761f8..044c40c58b85c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4012,7 +4012,7 @@ def igetitem(obj, i: int): # Using self.iloc[:, i] = ... may set values inplace, which # by convention we do not do in __setitem__ try: - self.columns = default_index(len(self.columns)) + self.columns = Index(range(len(self.columns))) for i, iloc in enumerate(ilocs): self[iloc] = igetitem(value, i) finally: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 356621a47c5d5..28eab57ac7bde 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -311,7 +311,7 @@ def ndarray_to_mgr( values = [values] if columns is None: - columns = default_index(len(values)) + columns = Index(range(len(values))) else: columns = ensure_index(columns) From 6e6149afe89d5b5d88d76c2c300c1f56af8c833b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 3 Nov 2022 16:14:44 +0000 Subject: [PATCH 3/3] :memo: add whatsnew note --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 47b7de04d7f95..f3396e5224222 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -336,6 +336,7 @@ Performance improvements - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) +- Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: