You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by fe...@apache.org on 2018/01/21 19:23:57 UTC

spark git commit: [SPARK-21293][SS][SPARKR] Add doc example for streaming join, dedup

Repository: spark
Updated Branches:
  refs/heads/master 4f43d27c9 -> 2239d7a41


[SPARK-21293][SS][SPARKR] Add doc example for streaming join, dedup

## What changes were proposed in this pull request?

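Update the Structured Streaming programming guide: add SparkR examples for stream-static joins, stream-stream inner and outer joins with watermarks, and streaming deduplication. Also fix the Python deduplication example to use `#` comments instead of `//`.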

## How was this patch tested?

manually

Author: Felix Cheung <fe...@hotmail.com>

Closes #20340 from felixcheung/rstreamdoc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2239d7a4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2239d7a4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2239d7a4

Branch: refs/heads/master
Commit: 2239d7a410e906ccd40aa8e84d637e9d06cd7b8a
Parents: 4f43d27
Author: Felix Cheung <fe...@hotmail.com>
Authored: Sun Jan 21 11:23:51 2018 -0800
Committer: Felix Cheung <fe...@apache.org>
Committed: Sun Jan 21 11:23:51 2018 -0800

----------------------------------------------------------------------
 docs/structured-streaming-programming-guide.md | 74 ++++++++++++++++++++-
 1 file changed, 72 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/2239d7a4/docs/structured-streaming-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md
index 2ef5d31..62589a6 100644
--- a/docs/structured-streaming-programming-guide.md
+++ b/docs/structured-streaming-programming-guide.md
@@ -1101,6 +1101,21 @@ streamingDf.join(staticDf, "type", "right_join")  # right outer join with a stat
 {% endhighlight %}
 
 </div>
+
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+staticDf <- read.df(...)
+streamingDf <- read.stream(...)
+# Inner equi-join with a static DF
+joined <- merge(streamingDf, staticDf, sort = FALSE)
+# Right outer join with a static DF
+joined <- join(
+  staticDf, streamingDf,
+  streamingDf$value == staticDf$value, "right_outer")
+{% endhighlight %}
+
+</div>
 </div>
 
 Note that stream-static joins are not stateful, so no state management is necessary.
@@ -1228,6 +1243,30 @@ impressionsWithWatermark.join(
 {% endhighlight %}
 
 </div>
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+impressions <- read.stream(...)
+clicks <- read.stream(...)
+
+# Attach a watermark to the event-time column of each stream
+impressionsWithWatermark <- withWatermark(
+  impressions, "impressionTime", "2 hours")
+clicksWithWatermark <- withWatermark(clicks, "clickTime", "3 hours")
+
+# Join on ad id with an event-time range constraint
+joined <- join(
+  impressionsWithWatermark,
+  clicksWithWatermark,
+  expr(paste(
+    "clickAdId = impressionAdId AND",
+    "clickTime >= impressionTime AND",
+    "clickTime <= impressionTime + interval 1 hour"
+  )))
+
+{% endhighlight %}
+
+</div>
 </div>
 
 ##### Outer Joins with Watermarking
@@ -1288,6 +1327,23 @@ impressionsWithWatermark.join(
 {% endhighlight %}
 
 </div>
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+# Outer join: the join type is the 4th argument of join(), after the condition
+joined <- join(
+  impressionsWithWatermark,
+  clicksWithWatermark,
+  expr(paste(
+    "clickAdId = impressionAdId AND",
+    "clickTime >= impressionTime AND",
+    "clickTime <= impressionTime + interval 1 hour")),
+  "left_outer"                 # can be "inner", "left_outer", "right_outer"
+)
+
+{% endhighlight %}
+
+</div>
 </div>
 
 However, note that the outer NULL results will be generated with a delay (depends on the specified
@@ -1441,16 +1497,30 @@ streamingDf
 {% highlight python %}
 streamingDf = spark.readStream. ...
 
-// Without watermark using guid column
+# Without watermark using guid column
 streamingDf.dropDuplicates("guid")
 
-// With watermark using guid and eventTime columns
+# With watermark using guid and eventTime columns
 streamingDf \
   .withWatermark("eventTime", "10 seconds") \
   .dropDuplicates("guid", "eventTime")
 {% endhighlight %}
 
 </div>
+<div data-lang="r"  markdown="1">
+
+{% highlight r %}
+streamingDf <- read.stream(...)
+
+# Without watermark using guid column
+dedupedByGuid <- dropDuplicates(streamingDf, "guid")
+
+# With watermark using guid and eventTime columns (independent of the above)
+withEventTime <- withWatermark(streamingDf, "eventTime", "10 seconds")
+dedupedByGuidAndTime <- dropDuplicates(withEventTime, "guid", "eventTime")
+{% endhighlight %}
+
+</div>
 </div>
 
 ### Arbitrary Stateful Operations


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org