You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ho...@apache.org on 2021/11/12 04:54:23 UTC

[arrow-datafusion] branch master updated: Dataframe supports except and update readme (#1261)

This is an automated email from the ASF dual-hosted git repository.

houqp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 0798ca4  Dataframe supports except and update readme (#1261)
0798ca4 is described below

commit 0798ca496801c13fe40587946d164f9969584e8d
Author: Carlos <wx...@gmail.com>
AuthorDate: Fri Nov 12 12:54:18 2021 +0800

    Dataframe supports except and update readme (#1261)
---
 README.md                                  |  6 +++---
 datafusion/src/dataframe.rs                | 15 +++++++++++++++
 datafusion/src/execution/dataframe_impl.rs | 23 +++++++++++++++++++++++
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7807f26..5ca0802 100644
--- a/README.md
+++ b/README.md
@@ -215,13 +215,13 @@ DataFusion also includes a simple command-line interactive SQL utility. See the
 - [ ] Lists
 - [x] Subqueries
 - [x] Common table expressions
-- [ ] Set Operations
+- [x] Set Operations
   - [x] UNION ALL
   - [x] UNION
   - [x] INTERSECT
   - [x] INTERSECT ALL
-  - [ ] EXCEPT
-  - [ ] EXCEPT ALL
+  - [x] EXCEPT
+  - [x] EXCEPT ALL
 - [x] Joins
   - [x] INNER JOIN
   - [x] LEFT JOIN
diff --git a/datafusion/src/dataframe.rs b/datafusion/src/dataframe.rs
index 7f4a3e5..c8c5dcc 100644
--- a/datafusion/src/dataframe.rs
+++ b/datafusion/src/dataframe.rs
@@ -390,4 +390,19 @@ pub trait DataFrame: Send + Sync {
     /// # }
     /// ```
     fn intersect(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>;
+
+    /// Calculate the set difference (SQL `EXCEPT ALL`) of two [`DataFrame`]s.  The two [`DataFrame`]s must have exactly the same schema.
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let mut ctx = ExecutionContext::new();
+    /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
+    /// let df = df.except(df.clone())?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    fn except(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>;
 }
diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs
index c202e19..f565f5c 100644
--- a/datafusion/src/execution/dataframe_impl.rs
+++ b/datafusion/src/execution/dataframe_impl.rs
@@ -240,6 +240,15 @@ impl DataFrame for DataFrameImpl {
             &LogicalPlanBuilder::intersect(left_plan, right_plan, true)?,
         )))
     }
+
+    fn except(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>> {
+        let left_plan = self.to_logical_plan();
+        let right_plan = dataframe.to_logical_plan();
+        Ok(Arc::new(DataFrameImpl::new(
+            self.ctx_state.clone(),
+            &LogicalPlanBuilder::except(left_plan, right_plan, true)?,
+        )))
+    }
 }
 
 #[cfg(test)]
@@ -461,6 +470,20 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn except() -> Result<()> {
+        let df = test_table().await?.select_columns(&["c1", "c3"])?;
+        let plan = df.except(df.clone())?;
+        let result = plan.to_logical_plan();
+        let expected = create_plan(
+            "SELECT c1, c3 FROM aggregate_test_100
+            EXCEPT ALL SELECT c1, c3 FROM aggregate_test_100",
+        )
+        .await?;
+        assert_same_plan(&result, &expected);
+        Ok(())
+    }
+
     /// Compare the formatted string representation of two plans for equality
     fn assert_same_plan(plan1: &LogicalPlan, plan2: &LogicalPlan) {
         assert_eq!(format!("{:?}", plan1), format!("{:?}", plan2));