You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by hv...@apache.org on 2023/02/22 00:47:59 UTC

[spark] branch master updated: [SPARK-42514][CONNECT] Scala Client add partition transforms functions

This is an automated email from the ASF dual-hosted git repository.

hvanhovell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 4f3afdcd756 [SPARK-42514][CONNECT] Scala Client add partition transforms functions
4f3afdcd756 is described below

commit 4f3afdcd7561db38f3c0427d31db4f27fa94a83c
Author: yangjie01 <ya...@baidu.com>
AuthorDate: Tue Feb 21 20:47:42 2023 -0400

    [SPARK-42514][CONNECT] Scala Client add partition transforms functions
    
    ### What changes were proposed in this pull request?
    This PR aims to add the partition transform functions to the Scala Spark Connect client.
    
    ### Why are the changes needed?
    Provide the same APIs in the Scala Spark Connect client as in the original Dataset API.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, it adds new functions to the Spark Connect Scala client.
    
    ### How was this patch tested?
    
    - Added new tests
    - Manually checked `connect-client-jvm` and `connect` with Scala-2.13
    
    Closes #40105 from LuciferYang/partition-transforms-functions.
    
    Authored-by: yangjie01 <ya...@baidu.com>
    Signed-off-by: Herman van Hovell <he...@databricks.com>
---
 .../scala/org/apache/spark/sql/functions.scala     | 58 ++++++++++++++++++++++
 .../org/apache/spark/sql/FunctionTestSuite.scala   |  1 +
 .../apache/spark/sql/PlanGenerationTestSuite.scala | 20 ++++++++
 .../explain-results/function_bucket.explain        |  2 +
 .../explain-results/function_days.explain          |  2 +
 .../explain-results/function_hours.explain         |  2 +
 .../explain-results/function_months.explain        |  2 +
 .../explain-results/function_years.explain         |  2 +
 .../query-tests/queries/function_bucket.json       | 23 +++++++++
 .../query-tests/queries/function_bucket.proto.bin  |  5 ++
 .../query-tests/queries/function_days.json         | 19 +++++++
 .../query-tests/queries/function_days.proto.bin    |  4 ++
 .../query-tests/queries/function_hours.json        | 19 +++++++
 .../query-tests/queries/function_hours.proto.bin   |  4 ++
 .../query-tests/queries/function_months.json       | 19 +++++++
 .../query-tests/queries/function_months.proto.bin  |  4 ++
 .../query-tests/queries/function_years.json        | 19 +++++++
 .../query-tests/queries/function_years.proto.bin   |  4 ++
 18 files changed, 209 insertions(+)

diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 6dffa8d3ea1..4996b5033e3 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -3591,6 +3591,64 @@ object functions {
    */
   def timestamp_seconds(e: Column): Column = Column.fn("timestamp_seconds", e)
 
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  // Partition Transforms functions
+  //////////////////////////////////////////////////////////////////////////////////////////////
+
+  /**
+   * A transform for timestamps and dates to partition data into years.
+   *
+   * @group partition_transforms
+   * @since 3.4.0
+   */
+  def years(e: Column): Column =
+    Column.fn("years", e)
+
+  /**
+   * A transform for timestamps and dates to partition data into months.
+   *
+   * @group partition_transforms
+   * @since 3.4.0
+   */
+  def months(e: Column): Column =
+    Column.fn("months", e)
+
+  /**
+   * A transform for timestamps and dates to partition data into days.
+   *
+   * @group partition_transforms
+   * @since 3.4.0
+   */
+  def days(e: Column): Column =
+    Column.fn("days", e)
+
+  /**
+   * A transform for timestamps to partition data into hours.
+   *
+   * @group partition_transforms
+   * @since 3.4.0
+   */
+  def hours(e: Column): Column =
+    Column.fn("hours", e)
+
+  /**
+   * A transform for any type that partitions by a hash of the input column.
+   *
+   * @group partition_transforms
+   * @since 3.4.0
+   */
+  def bucket(numBuckets: Column, e: Column): Column =
+    Column.fn("bucket", numBuckets, e)
+
+  /**
+   * A transform for any type that partitions by a hash of the input column.
+   *
+   * @group partition_transforms
+   * @since 3.4.0
+   */
+  def bucket(numBuckets: Int, e: Column): Column =
+    Column.fn("bucket", lit(numBuckets), e)
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   // Scala UDF functions
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala
index 1e4960ef9b2..d600ac432a2 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala
@@ -170,6 +170,7 @@ class FunctionTestSuite extends ConnectFunSuite {
     window(a, "10 seconds", "10 seconds"),
     window(a, "10 seconds"))
   testEquals("session_window", session_window(a, "1 second"), session_window(a, lit("1 second")))
+  testEquals("bucket", bucket(lit(3), a), bucket(3, a))
 
   test("assert_true no message") {
     val e = assert_true(a).expr
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index 830d55b6cbf..b9ae66b2b9e 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -1386,6 +1386,26 @@ class PlanGenerationTestSuite extends ConnectFunSuite with BeforeAndAfterAll wit
     fn.upper(fn.col("g"))
   }
 
+  functionTest("years") {
+    fn.years(Column("a"))
+  }
+
+  functionTest("months") {
+    fn.months(Column("a"))
+  }
+
+  functionTest("days") {
+    fn.days(Column("a"))
+  }
+
+  functionTest("hours") {
+    fn.hours(Column("a"))
+  }
+
+  functionTest("bucket") {
+    fn.bucket(3, Column("a"))
+  }
+
   private def temporalFunctionTest(name: String)(f: => Column): Unit = {
     test("function " + name) {
       temporals.select(f)
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bucket.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bucket.explain
new file mode 100644
index 00000000000..8ab0c9493ab
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bucket.explain
@@ -0,0 +1,2 @@
+Project [bucket(3, a#0) AS bucket(a)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_days.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_days.explain
new file mode 100644
index 00000000000..16ca2fe415e
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_days.explain
@@ -0,0 +1,2 @@
+Project [days(a#0) AS days(a)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_hours.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hours.explain
new file mode 100644
index 00000000000..a019836233d
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hours.explain
@@ -0,0 +1,2 @@
+Project [hours(a#0) AS hours(a)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_months.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months.explain
new file mode 100644
index 00000000000..17b991ec1aa
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months.explain
@@ -0,0 +1,2 @@
+Project [months(a#0) AS months(a)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_years.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_years.explain
new file mode 100644
index 00000000000..ee2342c4b02
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_years.explain
@@ -0,0 +1,2 @@
+Project [years(a#0) AS years(a)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.json b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.json
new file mode 100644
index 00000000000..4ec5fb5f27b
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.json
@@ -0,0 +1,23 @@
+{
+  "project": {
+    "input": {
+      "localRelation": {
+        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "bucket",
+        "arguments": [{
+          "literal": {
+            "integer": 3
+          }
+        }, {
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "a"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin
new file mode 100644
index 00000000000..4ccecb3d59c
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin
@@ -0,0 +1,5 @@
+�
+�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string>
+bucket
+0
+a
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_days.json b/connector/connect/common/src/test/resources/query-tests/queries/function_days.json
new file mode 100644
index 00000000000..8fb62b4a2e4
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_days.json
@@ -0,0 +1,19 @@
+{
+  "project": {
+    "input": {
+      "localRelation": {
+        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "days",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "a"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin
new file mode 100644
index 00000000000..ecfa97f445c
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin
@@ -0,0 +1,4 @@
+�
+�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string>
+days
+a
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hours.json b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.json
new file mode 100644
index 00000000000..4cccc0eef47
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.json
@@ -0,0 +1,19 @@
+{
+  "project": {
+    "input": {
+      "localRelation": {
+        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "hours",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "a"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin
new file mode 100644
index 00000000000..17cb707448a
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin
@@ -0,0 +1,4 @@
+�
+�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string>
+hours
+a
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months.json b/connector/connect/common/src/test/resources/query-tests/queries/function_months.json
new file mode 100644
index 00000000000..189def244d2
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_months.json
@@ -0,0 +1,19 @@
+{
+  "project": {
+    "input": {
+      "localRelation": {
+        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "months",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "a"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin
new file mode 100644
index 00000000000..9f6ea9641f6
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin
@@ -0,0 +1,4 @@
+�
+�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string>
+months
+a
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_years.json b/connector/connect/common/src/test/resources/query-tests/queries/function_years.json
new file mode 100644
index 00000000000..a0b6f4228d0
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_years.json
@@ -0,0 +1,19 @@
+{
+  "project": {
+    "input": {
+      "localRelation": {
+        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "years",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "a"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin
new file mode 100644
index 00000000000..f1e2a949fb4
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin
@@ -0,0 +1,4 @@
+�
+�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string>
+years
+a
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org