diff --git a/.gitignore b/.gitignore index b57efb74e..dba642930 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ target Cargo.lock +/venv .idea # Byte-compiled / optimized / DLL files diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py index bed0a91a6..30506a7c6 100644 --- a/datafusion/tests/test_dataframe.py +++ b/datafusion/tests/test_dataframe.py @@ -156,6 +156,30 @@ def test_join(): assert table.to_pydict() == expected +def test_distinct(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3, 1, 2, 3]), pa.array([4, 5, 6, 4, 5, 6])], + names=["a", "b"], + ) + df_a = ( + ctx.create_dataframe([[batch]]) + .distinct() + .sort(column("a").sort(ascending=True)) + ) + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df_b = ctx.create_dataframe([[batch]]).sort( + column("a").sort(ascending=True) + ) + + assert df_a.collect() == df_b.collect() + + def test_window_lead(df): df = df.select( column("a"), diff --git a/src/dataframe.rs b/src/dataframe.rs index 80963f7f0..87f38c170 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -129,6 +129,12 @@ impl PyDataFrame { Ok(pretty::print_batches(&batches)?) } + /// Filter out duplicate rows + fn distinct(&self) -> PyResult { + let df = self.df.distinct()?; + Ok(Self::new(df)) + } + fn join( &self, right: PyDataFrame, @@ -147,7 +153,7 @@ impl PyDataFrame { "The join type {} does not exist or is not implemented", how )) - .into()) + .into()); } };