You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by al...@apache.org on 2017/04/28 16:34:10 UTC

[1/2] beam git commit: [BEAM-2094] WordCount examples produce garbage for non-English input text

Repository: beam
Updated Branches:
  refs/heads/master d08b72892 -> bff84e27c


[BEAM-2094] WordCount examples produce garbage for non-English input text


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/594705d7
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/594705d7
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/594705d7

Branch: refs/heads/master
Commit: 594705d717dd6e5b81c08dcf4c56f8f6e7422737
Parents: d08b728
Author: Peter Gergo Barna <pb...@hortonworks.com>
Authored: Thu Apr 27 14:37:26 2017 +0200
Committer: Ahmet Altay <al...@google.com>
Committed: Fri Apr 28 09:33:13 2017 -0700

----------------------------------------------------------------------
 .../main/java/org/apache/beam/examples/MinimalWordCount.java | 3 ++-
 .../src/main/java/org/apache/beam/examples/WordCount.java    | 3 ++-
 .../java/org/apache/beam/examples/common/ExampleUtils.java   | 8 ++++++++
 .../apache/beam/examples/complete/StreamingWordExtract.java  | 2 +-
 .../java/org/apache/beam/examples/WindowedWordCountIT.java   | 4 +++-
 .../java/org/apache/beam/examples/MinimalWordCountJava8.java | 3 +--
 .../org/apache/beam/runners/spark/examples/WordCount.java    | 2 +-
 7 files changed, 18 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java b/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java
index 6085539..cf72672 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java
@@ -17,6 +17,7 @@
  */
 package org.apache.beam.examples;
 
+import org.apache.beam.examples.common.ExampleUtils;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.io.TextIO;
 import org.apache.beam.sdk.options.PipelineOptions;
@@ -83,7 +84,7 @@ public class MinimalWordCount {
      .apply("ExtractWords", ParDo.of(new DoFn<String, String>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
-                         for (String word : c.element().split("[^a-zA-Z']+")) {
+                         for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) {
                            if (!word.isEmpty()) {
                              c.output(word);
                            }

http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/main/java/org/apache/beam/examples/WordCount.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/org/apache/beam/examples/WordCount.java b/examples/java/src/main/java/org/apache/beam/examples/WordCount.java
index 0c786bc..58720b8 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/WordCount.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/WordCount.java
@@ -17,6 +17,7 @@
  */
 package org.apache.beam.examples;
 
+import org.apache.beam.examples.common.ExampleUtils;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.io.TextIO;
 import org.apache.beam.sdk.metrics.Counter;
@@ -95,7 +96,7 @@ public class WordCount {
       }
 
       // Split the line into words.
-      String[] words = c.element().split("[^a-zA-Z']+");
+      String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN);
 
       // Output each word encountered into the output PCollection.
       for (String word : words) {

http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java b/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java
index 6ac37fd..409085a 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java
@@ -66,6 +66,14 @@ public class ExampleUtils {
 
   private static final int SC_NOT_FOUND = 404;
 
+  /**
+   * \p{L} denotes the category of Unicode letters,
+   * so this pattern will match on everything that is not a letter.
+   *
+   * <p>It is used for tokenizing strings in the wordcount examples.
+   */
+  public static final String TOKENIZER_PATTERN = "[^\\p{L}]+";
+
   private final PipelineOptions options;
   private Bigquery bigQueryClient = null;
   private Pubsub pubsubClient = null;

http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java b/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java
index 20cee01..21a9849 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java
@@ -57,7 +57,7 @@ public class StreamingWordExtract {
   static class ExtractWords extends DoFn<String, String> {
     @ProcessElement
     public void processElement(ProcessContext c) {
-      String[] words = c.element().split("[^a-zA-Z']+");
+      String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN);
       for (String word : words) {
         if (!word.isEmpty()) {
           c.output(word);

http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java b/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java
index 857f1d3..a53a151 100644
--- a/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java
+++ b/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java
@@ -30,6 +30,8 @@ import java.util.List;
 import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.concurrent.ThreadLocalRandom;
+
+import org.apache.beam.examples.common.ExampleUtils;
 import org.apache.beam.examples.common.WriteOneFilePerWindow.PerWindowFiles;
 import org.apache.beam.sdk.PipelineResult;
 import org.apache.beam.sdk.options.PipelineOptionsFactory;
@@ -152,7 +154,7 @@ public class WindowedWordCountIT {
     SortedMap<String, Long> expectedWordCounts = new TreeMap<>();
     for (String line :
         inputFile.readFilesWithRetries(Sleeper.DEFAULT, BACK_OFF_FACTORY.backoff())) {
-      String[] words = line.split("[^a-zA-Z']+");
+      String[] words = line.split(ExampleUtils.TOKENIZER_PATTERN);
 
       for (String word : words) {
         if (!word.isEmpty()) {

http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java b/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java
index f424a7b..6badb75 100644
--- a/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java
+++ b/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java
@@ -58,13 +58,12 @@ public class MinimalWordCountJava8 {
     p.apply(TextIO.Read.from("gs://apache-beam-samples/shakespeare/*"))
      .apply(FlatMapElements
          .into(TypeDescriptors.strings())
-         .via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+"))))
+         .via((String word) -> Arrays.asList(word.split("[^\\p{L}]+"))))
      .apply(Filter.by((String word) -> !word.isEmpty()))
      .apply(Count.<String>perElement())
      .apply(MapElements
          .into(TypeDescriptors.strings())
          .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
-
      // CHANGE 3/3: The Google Cloud Storage path is required for outputting the results to.
      .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
 

http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java
----------------------------------------------------------------------
diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java
index de5ae48..0e6faad 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java
@@ -54,7 +54,7 @@ public class WordCount {
       }
 
       // Split the line into words.
-      String[] words = c.element().split("[^a-zA-Z']+");
+      String[] words = c.element().split("[^\\p{L}]+");
 
       // Output each word encountered into the output PCollection.
       for (String word : words) {


[2/2] beam git commit: This closes #2740

Posted by al...@apache.org.
This closes #2740


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/bff84e27
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/bff84e27
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/bff84e27

Branch: refs/heads/master
Commit: bff84e27ce9a479d0ad0ea51d2cd08bcc970af8e
Parents: d08b728 594705d
Author: Ahmet Altay <al...@google.com>
Authored: Fri Apr 28 09:33:49 2017 -0700
Committer: Ahmet Altay <al...@google.com>
Committed: Fri Apr 28 09:33:49 2017 -0700

----------------------------------------------------------------------
 .../main/java/org/apache/beam/examples/MinimalWordCount.java | 3 ++-
 .../src/main/java/org/apache/beam/examples/WordCount.java    | 3 ++-
 .../java/org/apache/beam/examples/common/ExampleUtils.java   | 8 ++++++++
 .../apache/beam/examples/complete/StreamingWordExtract.java  | 2 +-
 .../java/org/apache/beam/examples/WindowedWordCountIT.java   | 4 +++-
 .../java/org/apache/beam/examples/MinimalWordCountJava8.java | 3 +--
 .../org/apache/beam/runners/spark/examples/WordCount.java    | 2 +-
 7 files changed, 18 insertions(+), 7 deletions(-)
----------------------------------------------------------------------