add failing test to check row order preservation

Nico Cernek · Marco Gorelli · commit f81c4ee66522 · 2020-02-13T11:43:50.000Z
correct the imports; broken commit with a bunch of print statements and comments; add test for left merge; swap left and right keys when how == "right"; correct old test: right-merge row order is now the same as the right df; clean up spacing and delete temp code; add whatsnew; replace .from_records with default constructor; add GH issue # to tests; revert commit ed54bec; change logic to swap left and right if how==right; clean formatting; rename vars and add comment for clarity; combine tests into one; update whatsnew; Update doc/source/whatsnew/v1.0.0.rst (Co-Authored-By: William Ayd <william.ayd@icloud.com>); add before and after examples; linting; cleanup; changes requested by jreback; update docs
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -1244,8 +1244,9 @@ Reshaping
 - Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`)
 - Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`)
 - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`)
 - Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`)
 - Bug in missing `fill_na` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`)
+- :meth:`DataFrame.merge` now preserves the right frame's row order when executing a right merge (:issue:`27453`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -567,10 +567,10 @@ def __init__(
         indicator: bool = False,
         validate=None,
     ):
-        _left = _validate_operand(left)
-        _right = _validate_operand(right)
-        self.left = self.orig_left = _left
-        self.right = self.orig_right = _right
+        left = _validate_operand(left)
+        right = _validate_operand(right)
+        self.left = self.orig_left = left
+        self.right = self.orig_right = right
         self.how = how
         self.axis = axis
 
@@ -1292,6 +1292,9 @@ def _get_join_indexers(
         right_keys
     ), "left_key and right_keys must be the same length"
 
+    # pre-bind the caller's `sort` flag so later _factorize_keys calls can omit it
+    fkeys = partial(_factorize_keys, sort=sort)
+
     # get left & right join labels and num. of levels at each location
     mapped = (
         _factorize_keys(left_keys[n], right_keys[n], sort=sort)
@@ -1306,15 +1309,20 @@ def _get_join_indexers(
     # factorize keys to a dense i8 space
     # `count` is the num. of unique keys
     # set(lkey) | set(rkey) == range(count)
-    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
 
+    # for a right merge, pass the right keys first and unpack the results swapped;
+    # this preserves the right frame's row order (GH 27453)
+    if how == "right":
+        factorized_rkey, factorized_lkey, count = fkeys(rkey, lkey)
+    else:
+        factorized_lkey, factorized_rkey, count = fkeys(lkey, rkey)
     # preserve left frame order if how == 'left' and sort == False
     kwargs = copy.copy(kwargs)
     if how == "left":
         kwargs["sort"] = sort
     join_func = _join_functions[how]
 
-    return join_func(lkey, rkey, count, **kwargs)
+    return join_func(factorized_lkey, factorized_rkey, count, **kwargs)
 
 
 def _restore_dropped_levels_multijoin(
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
         # GH 24212
         # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
         # -1 is interpreted as a missing value instead of the last element
-        df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index)
-        df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]})
+        df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
+        df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]})
         result = df1.merge(df2, left_on="key", right_index=True, how=how)
         expected = pd.DataFrame(
             [
-                [1.0, 0, 1],
-                [2.0, 2, 3],
-                [3.0, 2, 3],
-                [np.nan, 1, 2],
-                [np.nan, 3, 4],
-                [np.nan, 4, 5],
+                [0, 0, 0],
+                [1, 1, 1],
+                [2, 2, 2],
+                [np.nan, 3, 3],
+                [np.nan, 4, 4],
+                [np.nan, 5, 5],
             ],
             columns=["a", "key", "b"],
         )
@@ -2167,3 +2167,37 @@ def test_merge_datetime_upcast_dtype():
         }
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("how", ["left", "right"])
+def test_merge_preserves_row_order(how):
+    # GH 27453
+    population = [
+        ("Jenn", "Jamaica", 3),
+        ("Beth", "Bulgaria", 7),
+        ("Carl", "Canada", 30),
+    ]
+    columns = ["name", "country", "population"]
+    population_df = DataFrame(population, columns=columns)
+
+    people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")]
+    columns = ["name", "country"]
+    people_df = DataFrame(people, columns=columns)
+
+    expected_data = [
+        ("Abe", "America", np.nan),
+        ("Beth", "Bulgaria", 7),
+        ("Carl", "Canada", 30),
+    ]
+    expected_cols = ["name", "country", "population"]
+    expected = DataFrame(expected_data, columns=expected_cols)
+
+    # people_df is the frame whose row order must survive in both cases: it is
+    # the right operand of the right merge and the left operand of the left merge
+    if how == "right":
+        left_df, right_df = population_df, people_df
+    else:
+        left_df, right_df = people_df, population_df
+
+    result = left_df.merge(right_df, on=["name", "country"], how=how)
+    tm.assert_frame_equal(result, expected)