You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by al...@apache.org on 2017/04/28 16:34:10 UTC
[1/2] beam git commit: [BEAM-2094] WordCount examples produce garbage for non-English input text
Repository: beam
Updated Branches:
refs/heads/master d08b72892 -> bff84e27c
[BEAM-2094] WordCount examples produce garbage for non-English input text
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/594705d7
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/594705d7
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/594705d7
Branch: refs/heads/master
Commit: 594705d717dd6e5b81c08dcf4c56f8f6e7422737
Parents: d08b728
Author: Peter Gergo Barna <pb...@hortonworks.com>
Authored: Thu Apr 27 14:37:26 2017 +0200
Committer: Ahmet Altay <al...@google.com>
Committed: Fri Apr 28 09:33:13 2017 -0700
----------------------------------------------------------------------
.../main/java/org/apache/beam/examples/MinimalWordCount.java | 3 ++-
.../src/main/java/org/apache/beam/examples/WordCount.java | 3 ++-
.../java/org/apache/beam/examples/common/ExampleUtils.java | 8 ++++++++
.../apache/beam/examples/complete/StreamingWordExtract.java | 2 +-
.../java/org/apache/beam/examples/WindowedWordCountIT.java | 4 +++-
.../java/org/apache/beam/examples/MinimalWordCountJava8.java | 3 +--
.../org/apache/beam/runners/spark/examples/WordCount.java | 2 +-
7 files changed, 18 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java b/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java
index 6085539..cf72672 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/MinimalWordCount.java
@@ -17,6 +17,7 @@
*/
package org.apache.beam.examples;
+import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptions;
@@ -83,7 +84,7 @@ public class MinimalWordCount {
.apply("ExtractWords", ParDo.of(new DoFn<String, String>() {
@ProcessElement
public void processElement(ProcessContext c) {
- for (String word : c.element().split("[^a-zA-Z']+")) {
+ for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) {
if (!word.isEmpty()) {
c.output(word);
}
http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/main/java/org/apache/beam/examples/WordCount.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/org/apache/beam/examples/WordCount.java b/examples/java/src/main/java/org/apache/beam/examples/WordCount.java
index 0c786bc..58720b8 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/WordCount.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/WordCount.java
@@ -17,6 +17,7 @@
*/
package org.apache.beam.examples;
+import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.metrics.Counter;
@@ -95,7 +96,7 @@ public class WordCount {
}
// Split the line into words.
- String[] words = c.element().split("[^a-zA-Z']+");
+ String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN);
// Output each word encountered into the output PCollection.
for (String word : words) {
http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java b/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java
index 6ac37fd..409085a 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/common/ExampleUtils.java
@@ -66,6 +66,14 @@ public class ExampleUtils {
private static final int SC_NOT_FOUND = 404;
+ /**
+ * {@code \p{L}} denotes the Unicode "letter" category, so this pattern matches any run of
+ * one or more non-letter characters (unlike the old ASCII pattern, this includes apostrophes).
+ *
+ * <p>It is used for tokenizing strings in the wordcount examples.
+ */
+ public static final String TOKENIZER_PATTERN = "[^\\p{L}]+";
+
private final PipelineOptions options;
private Bigquery bigQueryClient = null;
private Pubsub pubsubClient = null;
http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java b/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java
index 20cee01..21a9849 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/complete/StreamingWordExtract.java
@@ -57,7 +57,7 @@ public class StreamingWordExtract {
static class ExtractWords extends DoFn<String, String> {
@ProcessElement
public void processElement(ProcessContext c) {
- String[] words = c.element().split("[^a-zA-Z']+");
+ String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN);
for (String word : words) {
if (!word.isEmpty()) {
c.output(word);
http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java b/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java
index 857f1d3..a53a151 100644
--- a/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java
+++ b/examples/java/src/test/java/org/apache/beam/examples/WindowedWordCountIT.java
@@ -30,6 +30,8 @@ import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ThreadLocalRandom;
+
+import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.examples.common.WriteOneFilePerWindow.PerWindowFiles;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
@@ -152,7 +154,7 @@ public class WindowedWordCountIT {
SortedMap<String, Long> expectedWordCounts = new TreeMap<>();
for (String line :
inputFile.readFilesWithRetries(Sleeper.DEFAULT, BACK_OFF_FACTORY.backoff())) {
- String[] words = line.split("[^a-zA-Z']+");
+ String[] words = line.split(ExampleUtils.TOKENIZER_PATTERN);
for (String word : words) {
if (!word.isEmpty()) {
http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java b/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java
index f424a7b..6badb75 100644
--- a/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java
+++ b/examples/java8/src/main/java/org/apache/beam/examples/MinimalWordCountJava8.java
@@ -58,13 +58,12 @@ public class MinimalWordCountJava8 {
p.apply(TextIO.Read.from("gs://apache-beam-samples/shakespeare/*"))
.apply(FlatMapElements
.into(TypeDescriptors.strings())
- .via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+"))))
+ .via((String word) -> Arrays.asList(word.split("[^\\p{L}]+"))))
.apply(Filter.by((String word) -> !word.isEmpty()))
.apply(Count.<String>perElement())
.apply(MapElements
.into(TypeDescriptors.strings())
.via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
-
// CHANGE 3/3: The Google Cloud Storage path is required for outputting the results to.
.apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
http://git-wip-us.apache.org/repos/asf/beam/blob/594705d7/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java
----------------------------------------------------------------------
diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java
index de5ae48..0e6faad 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/examples/WordCount.java
@@ -54,7 +54,7 @@ public class WordCount {
}
// Split the line into words.
- String[] words = c.element().split("[^a-zA-Z']+");
+ String[] words = c.element().split("[^\\p{L}]+");
// Output each word encountered into the output PCollection.
for (String word : words) {
[2/2] beam git commit: This closes #2740
Posted by al...@apache.org.
This closes #2740
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/bff84e27
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/bff84e27
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/bff84e27
Branch: refs/heads/master
Commit: bff84e27ce9a479d0ad0ea51d2cd08bcc970af8e
Parents: d08b728 594705d
Author: Ahmet Altay <al...@google.com>
Authored: Fri Apr 28 09:33:49 2017 -0700
Committer: Ahmet Altay <al...@google.com>
Committed: Fri Apr 28 09:33:49 2017 -0700
----------------------------------------------------------------------
.../main/java/org/apache/beam/examples/MinimalWordCount.java | 3 ++-
.../src/main/java/org/apache/beam/examples/WordCount.java | 3 ++-
.../java/org/apache/beam/examples/common/ExampleUtils.java | 8 ++++++++
.../apache/beam/examples/complete/StreamingWordExtract.java | 2 +-
.../java/org/apache/beam/examples/WindowedWordCountIT.java | 4 +++-
.../java/org/apache/beam/examples/MinimalWordCountJava8.java | 3 +--
.../org/apache/beam/runners/spark/examples/WordCount.java | 2 +-
7 files changed, 18 insertions(+), 7 deletions(-)
----------------------------------------------------------------------