diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b40a64420a0be..1cb1f745fb61b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -333,6 +333,7 @@ Numeric - :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth: `DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`) - Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) - Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`) +- Improved error message in :meth:`DataFrame.sample` when using ``frac`` > 1 with ``replace=False`` (:issue:`27451`) - Conversion diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bafc37d478fdb..ffe8e794a03ea 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4934,6 +4934,10 @@ def sample( numpy.random.choice: Generates a random sample from a given 1-D numpy array. + Notes + ----- + If `frac` > 1, `replace` should be set to `True`. + Examples -------- >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0], @@ -4964,6 +4968,20 @@ def sample( dog 4 0 2 fish 0 0 8 + An upsampled sample of the ``DataFrame`` with replacement: + Note that the `replace` parameter has to be `True` when the `frac` parameter is > 1. + + >>> df.sample(frac=2, replace=True, random_state=1) + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + falcon 2 2 10 + falcon 2 2 10 + fish 0 0 8 + dog 4 0 2 + fish 0 0 8 + dog 4 0 2 + Using a DataFrame column as weights. Rows with larger value in the `num_specimen_seen` column are more likely to be sampled. @@ -5039,6 +5057,11 @@ def sample( # If no frac or n, default to n=1. 
if n is None and frac is None: n = 1 + elif frac is not None and frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) elif n is not None and frac is None and n % 1 != 0: raise ValueError("Only integers accepted as `n` values") elif n is None and frac is not None: diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index a7506f3d60b3c..c180511e31619 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -322,6 +322,7 @@ def test_sample(self): self._compare( o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed) ) + self._compare( o.sample(frac=0.7, random_state=seed), o.sample(frac=0.7, random_state=seed), @@ -337,6 +338,15 @@ def test_sample(self): o.sample(frac=0.7, random_state=np.random.RandomState(test)), ) + self._compare( + o.sample( + frac=2, replace=True, random_state=np.random.RandomState(test) + ), + o.sample( + frac=2, replace=True, random_state=np.random.RandomState(test) + ), + ) + os1, os2 = [], [] for _ in range(2): np.random.seed(test) @@ -424,6 +434,17 @@ def test_sample(self): weights_with_None[5] = 0.5 self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) + def test_sample_upsampling_without_replacement(self): + # GH27451 + + df = pd.DataFrame({"A": list("abc")}) + msg = ( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) + with pytest.raises(ValueError, match=msg): + df.sample(frac=2, replace=False) + def test_size_compat(self): # GH8846 # size property should be defined