You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ka...@apache.org on 2021/08/27 03:40:30 UTC
[spark] branch branch-3.2 updated: [SPARK-36595][SQL][SS][DOCS]
Document window & session_window function in SQL API doc
This is an automated email from the ASF dual-hosted git repository.
kabhwan pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new 118a53d [SPARK-36595][SQL][SS][DOCS] Document window & session_window function in SQL API doc
118a53d is described below
commit 118a53d87f9a809c6c47c061dfa20469405d4f69
Author: Jungtaek Lim <ka...@gmail.com>
AuthorDate: Fri Aug 27 12:39:09 2021 +0900
[SPARK-36595][SQL][SS][DOCS] Document window & session_window function in SQL API doc
### What changes were proposed in this pull request?
This PR proposes to document the `window` & `session_window` functions in the SQL API doc page.
Screenshot of functions:
> window
![스크린샷 2021-08-26 오후 6 34 58](https://user-images.githubusercontent.com/1317309/130939754-0ea1b55e-39d4-4205-b79d-a9508c98921c.png)
> session_window
![스크린샷 2021-08-26 오후 6 35 19](https://user-images.githubusercontent.com/1317309/130939773-b6cb4b98-88f8-4d57-a188-ee40ed7b2b08.png)
### Why are the changes needed?
The description is missing for both the `window` and `session_window` functions on the SQL API page.
### Does this PR introduce _any_ user-facing change?
Yes, the descriptions of the `window` / `session_window` functions will be available on the SQL API page.
### How was this patch tested?
Only doc changes.
Closes #33846 from HeartSaVioR/SPARK-36595.
Authored-by: Jungtaek Lim <ka...@gmail.com>
Signed-off-by: Jungtaek Lim <ka...@gmail.com>
(cherry picked from commit bc32144a91c0f3b9f9242795dd0f777fb01d57d9)
Signed-off-by: Jungtaek Lim <ka...@gmail.com>
---
.../sql/catalyst/expressions/SessionWindow.scala | 28 ++++++++++++++++
.../sql/catalyst/expressions/TimeWindow.scala | 37 ++++++++++++++++++++++
.../sql-functions/sql-expression-schema.md | 8 ++---
.../scala/org/apache/spark/sql/SQLQuerySuite.scala | 3 +-
.../sql/expressions/ExpressionInfoSuite.scala | 3 --
5 files changed, 70 insertions(+), 9 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SessionWindow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SessionWindow.scala
index eb46c0f..796ea27 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SessionWindow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SessionWindow.scala
@@ -35,6 +35,34 @@ import org.apache.spark.unsafe.types.UTF8String
* duration during the query execution. Note that the rows with negative or
* zero gap duration will be filtered out from the aggregation.
*/
+// scalastyle:off line.size.limit line.contains.tab
+@ExpressionDescription(
+ usage = """
+ _FUNC_(time_column, gap_duration) - Generates session window given a timestamp specifying column and gap duration.
+ See <a href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#types-of-time-windows">'Types of time windows'</a> in Structured Streaming guide doc for detailed explanation and examples.
+ """,
+ arguments = """
+ Arguments:
+ * time_column - The column or the expression to use as the timestamp for windowing by time. The time column must be of TimestampType.
+ * gap_duration - A string specifying the timeout of the session represented as "interval value"
+ (See <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#interval-literal">Interval Literal</a> for more details.) for the fixed gap duration, or
+ an expression which is applied for each input and evaluated to the "interval value" for the dynamic gap duration.
+ """,
+ examples = """
+ Examples:
+ > SELECT a, session_window.start, session_window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:10:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, _FUNC_(b, '5 minutes') ORDER BY a, start;
+ A1 2021-01-01 00:00:00 2021-01-01 00:09:30 2
+ A1 2021-01-01 00:10:00 2021-01-01 00:15:00 1
+ A2 2021-01-01 00:01:00 2021-01-01 00:06:00 1
+ > SELECT a, session_window.start, session_window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:10:00'), ('A2', '2021-01-01 00:01:00'), ('A2', '2021-01-01 00:04:30') AS tab(a, b) GROUP by a, _FUNC_(b, CASE WHEN a = 'A1' THEN '5 minutes' WHEN a = 'A2' THEN '1 minute' ELSE '10 minutes' END) ORDER BY a, start;
+ A1 2021-01-01 00:00:00 2021-01-01 00:09:30 2
+ A1 2021-01-01 00:10:00 2021-01-01 00:15:00 1
+ A2 2021-01-01 00:01:00 2021-01-01 00:02:00 1
+ A2 2021-01-01 00:04:30 2021-01-01 00:05:30 1
+ """,
+ group = "datetime_funcs",
+ since = "3.2.0")
+// scalastyle:on line.size.limit line.contains.tab
case class SessionWindow(timeColumn: Expression, gapDuration: Expression) extends Expression
with ImplicitCastInputTypes
with Unevaluable
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala
index 2f08fd7..d7deca2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala
@@ -27,6 +27,43 @@ import org.apache.spark.sql.catalyst.util.IntervalUtils
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.types._
+// scalastyle:off line.size.limit line.contains.tab
+@ExpressionDescription(
+ usage = """
+ _FUNC_(time_column, window_duration[, slide_duration[, start_time]]) - Bucketize rows into one or more time windows given a timestamp specifying column.
+ Window starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window [12:05,12:10) but not in [12:00,12:05).
+ Windows can support microsecond precision. Windows in the order of months are not supported.
+ See <a href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#window-operations-on-event-time">'Window Operations on Event Time'</a> in Structured Streaming guide doc for detailed explanation and examples.
+ """,
+ arguments = """
+ Arguments:
+ * time_column - The column or the expression to use as the timestamp for windowing by time. The time column must be of TimestampType.
+ * window_duration - A string specifying the width of the window represented as "interval value".
+ (See <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#interval-literal">Interval Literal</a> for more details.)
+ Note that the duration is a fixed length of time, and does not vary over time according to a calendar.
+ * slide_duration - A string specifying the sliding interval of the window represented as "interval value".
+ A new window will be generated every `slide_duration`. Must be less than or equal to the `window_duration`.
+ This duration is likewise absolute, and does not vary according to a calendar.
+ * start_time - The offset with respect to 1970-01-01 00:00:00 UTC with which to start window intervals.
+ For example, in order to have hourly tumbling windows that start 15 minutes past the hour,
+ e.g. 12:15-13:15, 13:15-14:15... provide `start_time` as `15 minutes`.
+ """,
+ examples = """
+ Examples:
+ > SELECT a, window.start, window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, _FUNC_(b, '5 minutes') ORDER BY a, start;
+ A1 2021-01-01 00:00:00 2021-01-01 00:05:00 2
+ A1 2021-01-01 00:05:00 2021-01-01 00:10:00 1
+ A2 2021-01-01 00:00:00 2021-01-01 00:05:00 1
+ > SELECT a, window.start, window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, _FUNC_(b, '10 minutes', '5 minutes') ORDER BY a, start;
+ A1 2020-12-31 23:55:00 2021-01-01 00:05:00 2
+ A1 2021-01-01 00:00:00 2021-01-01 00:10:00 3
+ A1 2021-01-01 00:05:00 2021-01-01 00:15:00 1
+ A2 2020-12-31 23:55:00 2021-01-01 00:05:00 1
+ A2 2021-01-01 00:00:00 2021-01-01 00:10:00 1
+ """,
+ group = "datetime_funcs",
+ since = "2.0.0")
+// scalastyle:on line.size.limit line.contains.tab
case class TimeWindow(
timeColumn: Expression,
windowDuration: Long,
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 6eafb38..26fcdb3 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -1,8 +1,8 @@
<!-- Automatically generated by ExpressionsSchemaSuite -->
## Summary
- Number of queries: 361
- - Number of expressions that missing example: 14
- - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint,session_window,window
+ - Number of expressions that missing example: 12
+ - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint
## Schema of Built-in Functions
| Class name | Function name or alias | Query example | Output schema |
| ---------- | ---------------------- | ------------- | ------------- |
@@ -244,7 +244,7 @@
| org.apache.spark.sql.catalyst.expressions.SecondsToTimestamp | timestamp_seconds | SELECT timestamp_seconds(1230219000) | struct<timestamp_seconds(1230219000):timestamp> |
| org.apache.spark.sql.catalyst.expressions.Sentences | sentences | SELECT sentences('Hi there! Good morning.') | struct<sentences(Hi there! Good morning., , ):array<array<string>>> |
| org.apache.spark.sql.catalyst.expressions.Sequence | sequence | SELECT sequence(1, 5) | struct<sequence(1, 5):array<int>> |
-| org.apache.spark.sql.catalyst.expressions.SessionWindow | session_window | N/A | N/A |
+| org.apache.spark.sql.catalyst.expressions.SessionWindow | session_window | SELECT a, session_window.start, session_window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:10:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, session_window(b, '5 minutes') ORDER BY a, start | struct<a:string,start:timestamp,end:timestamp,cnt:bigint> |
| org.apache.spark.sql.catalyst.expressions.Sha1 | sha | SELECT sha('Spark') | struct<sha(Spark):string> |
| org.apache.spark.sql.catalyst.expressions.Sha1 | sha1 | SELECT sha1('Spark') | struct<sha1(Spark):string> |
| org.apache.spark.sql.catalyst.expressions.Sha2 | sha2 | SELECT sha2('Spark', 256) | struct<sha2(Spark, 256):string> |
@@ -288,7 +288,7 @@
| org.apache.spark.sql.catalyst.expressions.Subtract | - | SELECT 2 - 1 | struct<(2 - 1):int> |
| org.apache.spark.sql.catalyst.expressions.Tan | tan | SELECT tan(0) | struct<TAN(0):double> |
| org.apache.spark.sql.catalyst.expressions.Tanh | tanh | SELECT tanh(0) | struct<TANH(0):double> |
-| org.apache.spark.sql.catalyst.expressions.TimeWindow | window | N/A | N/A |
+| org.apache.spark.sql.catalyst.expressions.TimeWindow | window | SELECT a, window.start, window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, window(b, '5 minutes') ORDER BY a, start | struct<a:string,start:timestamp,end:timestamp,cnt:bigint> |
| org.apache.spark.sql.catalyst.expressions.ToDegrees | degrees | SELECT degrees(3.141592653589793) | struct<DEGREES(3.141592653589793):double> |
| org.apache.spark.sql.catalyst.expressions.ToRadians | radians | SELECT radians(180) | struct<RADIANS(180):double> |
| org.apache.spark.sql.catalyst.expressions.ToUTCTimestamp | to_utc_timestamp | SELECT to_utc_timestamp('2016-08-31', 'Asia/Seoul') | struct<to_utc_timestamp(2016-08-31, Asia/Seoul):timestamp> |
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 032ddbb..7bbbaad 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -136,8 +136,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
test("SPARK-14415: All functions should have own descriptions") {
for (f <- spark.sessionState.functionRegistry.listFunction()) {
- if (!Seq("cube", "grouping", "grouping_id", "rollup", "window",
- "session_window").contains(f.unquotedString)) {
+ if (!Seq("cube", "grouping", "grouping_id", "rollup").contains(f.unquotedString)) {
checkKeywordsNotExist(sql(s"describe function $f"), "N/A.")
}
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
index 08e21d5..11c06d3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
@@ -131,9 +131,6 @@ class ExpressionInfoSuite extends SparkFunSuite with SharedSparkSession {
test("SPARK-32870: Default expressions in FunctionRegistry should have their " +
"usage, examples, since, and group filled") {
val ignoreSet = Set(
- // Explicitly inherits NonSQLExpression, and has no ExpressionDescription
- "org.apache.spark.sql.catalyst.expressions.TimeWindow",
- "org.apache.spark.sql.catalyst.expressions.SessionWindow",
// Cast aliases do not need examples
"org.apache.spark.sql.catalyst.expressions.Cast")
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org