You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by al...@apache.org on 2020/08/06 17:29:47 UTC
[beam] branch master updated: [BEAM-9421] Add Java snippets to NLP
documentation.
This is an automated email from the ASF dual-hosted git repository.
altay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new a76cf13 [BEAM-9421] Add Java snippets to NLP documentation.
new fdb3986 Merge pull request #12453 from mwalenia/BEAM-9421-nlp-transform-java-website-doc
a76cf13 is described below
commit a76cf139b6bd5242be31d22f85f23cafce100612
Author: Michal Walenia <mi...@polidea.com>
AuthorDate: Mon Aug 3 16:16:49 2020 +0200
[BEAM-9421] Add Java snippets to NLP documentation.
---
examples/java/build.gradle | 3 +
.../apache/beam/examples/snippets/Snippets.java | 174 +++++++++++++++++++++
.../en/documentation/patterns/ai-platform.md | 8 +-
3 files changed, 181 insertions(+), 4 deletions(-)
diff --git a/examples/java/build.gradle b/examples/java/build.gradle
index 0c91573..eaad35f 100644
--- a/examples/java/build.gradle
+++ b/examples/java/build.gradle
@@ -52,6 +52,7 @@ dependencies {
compile project(path: ":sdks:java:core", configuration: "shadow")
compile project(":sdks:java:extensions:google-cloud-platform-core")
compile project(":sdks:java:io:google-cloud-platform")
+ compile project(":sdks:java:extensions:ml")
compile library.java.avro
compile library.java.bigdataoss_util
compile library.java.google_api_client
@@ -60,6 +61,7 @@ dependencies {
compile library.java.google_auth_library_credentials
compile library.java.google_auth_library_oauth2_http
compile library.java.google_cloud_datastore_v1_proto_client
+ compile library.java.google_code_gson
compile library.java.google_http_client
compile library.java.joda_time
compile library.java.proto_google_cloud_datastore_v1
@@ -67,6 +69,7 @@ dependencies {
compile library.java.slf4j_jdk14
runtime project(path: ":runners:direct-java", configuration: "shadow")
testCompile project(":sdks:java:io:google-cloud-platform")
+ testCompile project(":sdks:java:extensions:ml")
testCompile library.java.hamcrest_core
testCompile library.java.hamcrest_library
testCompile library.java.junit
diff --git a/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java b/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java
index 3393e6d..cb55475f 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java
@@ -22,6 +22,14 @@ import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.bigquery.model.TimePartitioning;
+import com.google.cloud.language.v1.AnnotateTextRequest;
+import com.google.cloud.language.v1.AnnotateTextResponse;
+import com.google.cloud.language.v1.Document;
+import com.google.cloud.language.v1.Entity;
+import com.google.cloud.language.v1.Sentence;
+import com.google.cloud.language.v1.Token;
+import com.google.gson.Gson;
+import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
@@ -30,12 +38,14 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.stream.Collectors;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.coders.DoubleCoder;
+import org.apache.beam.sdk.extensions.ml.AnnotateText;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.GenerateSequence;
@@ -63,6 +73,7 @@ import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.PeriodicImpulse;
+import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.Watch;
@@ -984,4 +995,167 @@ public class Snippets {
return result;
}
}
+
+ public static class NaturalLanguageIntegration {
+    /**
+     * Builds one adjacency map per sentence of the response: each key is the text of a head token
+     * and each value lists the texts of the tokens whose dependency edge points at that head.
+     *
+     * <p>Tokens are consumed in order; a token is attributed to the current sentence while its
+     * begin offset does not exceed the offset of the sentence's last character.
+     */
+    private static final SerializableFunction<AnnotateTextResponse, List<Map<String, List<String>>>>
+        // [START NlpAnalyzeDependencyTree]
+        analyzeDependencyTree =
+            (SerializableFunction<AnnotateTextResponse, List<Map<String, List<String>>>>)
+                response -> {
+                  List<Map<String, List<String>>> sentenceGraphs = new ArrayList<>();
+                  // Position in the token list; shared across sentences so every token is
+                  // visited at most once.
+                  int cursor = 0;
+                  for (Sentence sentence : response.getSentencesList()) {
+                    int firstOffset = sentence.getText().getBeginOffset();
+                    int lastOffset = firstOffset + sentence.getText().getContent().length() - 1;
+                    Map<String, List<String>> childrenByHead = new HashMap<>();
+                    for (;
+                        cursor < response.getTokensCount()
+                            && response.getTokens(cursor).getText().getBeginOffset() <= lastOffset;
+                        cursor++) {
+                      Token current = response.getTokens(cursor);
+                      int headPosition = current.getDependencyEdge().getHeadTokenIndex();
+                      String headText = response.getTokens(headPosition).getText().getContent();
+                      childrenByHead
+                          .computeIfAbsent(headText, unused -> new ArrayList<>())
+                          .add(current.getText().getContent());
+                    }
+                    sentenceGraphs.add(childrenByHead);
+                  }
+                  return sentenceGraphs;
+                };
+    // [END NlpAnalyzeDependencyTree]
+
+    /**
+     * Extracts the document-level sentiment and a map from sentence text to sentence sentiment
+     * from an annotation response.
+     *
+     * <p>If the same sentence text occurs more than once, the value of its first occurrence is
+     * kept. Without a merge function {@code Collectors.toMap} throws {@code IllegalStateException}
+     * on a duplicate key, which would fail the element for inputs such as "Yes. Yes.".
+     *
+     * <p>NOTE(review): both values are read with {@code getMagnitude()}, while the accompanying
+     * documentation talks about the sentiment "score" - confirm which one is intended.
+     */
+    private static final SerializableFunction<? super AnnotateTextResponse, TextSentiments>
+        // [START NlpExtractSentiments]
+        extractSentiments =
+            (SerializableFunction<AnnotateTextResponse, TextSentiments>)
+                annotateTextResponse -> {
+                  TextSentiments sentiments = new TextSentiments();
+                  sentiments.setDocumentSentiment(
+                      annotateTextResponse.getDocumentSentiment().getMagnitude());
+                  Map<String, Float> sentenceSentimentsMap =
+                      annotateTextResponse.getSentencesList().stream()
+                          .collect(
+                              Collectors.toMap(
+                                  (Sentence s) -> s.getText().getContent(),
+                                  (Sentence s) -> s.getSentiment().getMagnitude(),
+                                  // Repeated sentence text: keep the first value, do not fail.
+                                  (first, repeated) -> first));
+                  sentiments.setSentenceSentiments(sentenceSentimentsMap);
+                  return sentiments;
+                };
+    // [END NlpExtractSentiments]
+
+    /**
+     * Maps every entity name found in the response to the string form of its entity type.
+     *
+     * <p>If the response contains several entities with the same name, the type of the first one
+     * is kept. Without a merge function {@code Collectors.toMap} throws
+     * {@code IllegalStateException} on a duplicate key.
+     */
+    private static final SerializableFunction<? super AnnotateTextResponse, Map<String, String>>
+        // [START NlpExtractEntities]
+        extractEntities =
+            (SerializableFunction<AnnotateTextResponse, Map<String, String>>)
+                annotateTextResponse ->
+                    annotateTextResponse.getEntitiesList().stream()
+                        .collect(
+                            Collectors.toMap(
+                                Entity::getName,
+                                (Entity e) -> e.getType().toString(),
+                                // Repeated entity name: keep the first type, do not fail.
+                                (first, repeated) -> first));
+    // [END NlpExtractEntities]
+
+    /**
+     * Renders an entity-name-to-type map as a JSON array of single-entry objects, for example
+     * {@code [{"name": "TYPE"}]}; an empty map yields {@code []}.
+     *
+     * <p>Keys and values are encoded with Gson rather than concatenated verbatim, so quotes,
+     * backslashes and control characters inside entity names are escaped and the output remains
+     * valid JSON.
+     */
+    private static final SerializableFunction<? super Map<String, String>, String>
+        mapEntitiesToJson =
+            (SerializableFunction<Map<String, String>, String>)
+                item -> {
+                  // Created inside the lambda so the serialized function captures no state.
+                  Gson gson = new Gson();
+                  return item.entrySet().stream()
+                      .map(
+                          entry ->
+                              "{"
+                                  + gson.toJson(entry.getKey())
+                                  + ": "
+                                  + gson.toJson(entry.getValue())
+                                  + "}")
+                      .collect(Collectors.joining(",", "[", "]"));
+                };
+
+    /** Serializes the list of per-sentence adjacency maps to a JSON string using Gson. */
+    private static final SerializableFunction<List<Map<String, List<String>>>, String>
+        mapDependencyTreesToJson =
+            (SerializableFunction<List<Map<String, List<String>>>, String>)
+                adjacencyLists -> new Gson().toJson(adjacencyLists);
+
+    /**
+     * Adds the Natural Language example to the given pipeline: one in-memory sentence is turned
+     * into a plain-text {@code Document}, annotated with {@code AnnotateText}, and the response is
+     * written as JSON to three text outputs (sentiments, entities, dependency adjacency lists).
+     *
+     * @param p pipeline the transforms are applied to; this method does not run it
+     */
+    public static void main(Pipeline p) {
+      // [START NlpAnalyzeText]
+      AnnotateTextRequest.Features features =
+          AnnotateTextRequest.Features.newBuilder()
+              .setExtractEntities(true)
+              .setExtractDocumentSentiment(true)
+              .setExtractEntitySentiment(true)
+              .setExtractSyntax(true)
+              .build();
+      AnnotateText annotateText = AnnotateText.newBuilder().setFeatures(features).build();
+
+      // Wraps a raw string into a plain-text document understood by the annotation transform.
+      SerializableFunction<String, Document> toPlainTextDocument =
+          input ->
+              Document.newBuilder().setContent(input).setType(Document.Type.PLAIN_TEXT).build();
+
+      PCollection<String> reviews =
+          p.apply(
+              Create.of(
+                  "My experience so far has been fantastic, "
+                      + "I\'d really recommend this product."));
+      PCollection<Document> documents =
+          reviews.apply(
+              MapElements.into(TypeDescriptor.of(Document.class)).via(toPlainTextDocument));
+      PCollection<AnnotateTextResponse> responses = documents.apply(annotateText);
+
+      // Sentiments: response -> TextSentiments -> JSON -> text file.
+      responses
+          .apply(MapElements.into(TypeDescriptor.of(TextSentiments.class)).via(extractSentiments))
+          .apply(
+              MapElements.into(TypeDescriptors.strings())
+                  .via((SerializableFunction<TextSentiments, String>) TextSentiments::toJson))
+          .apply(TextIO.write().to("sentiments.txt"));
+
+      // Entities: response -> name/type map -> JSON -> text file.
+      responses
+          .apply(
+              MapElements.into(
+                      TypeDescriptors.maps(TypeDescriptors.strings(), TypeDescriptors.strings()))
+                  .via(extractEntities))
+          .apply(MapElements.into(TypeDescriptors.strings()).via(mapEntitiesToJson))
+          .apply(TextIO.write().to("entities.txt"));
+
+      // Dependency trees: response -> adjacency maps -> JSON -> text file.
+      responses
+          .apply(
+              MapElements.into(
+                      TypeDescriptors.lists(
+                          TypeDescriptors.maps(
+                              TypeDescriptors.strings(),
+                              TypeDescriptors.lists(TypeDescriptors.strings()))))
+                  .via(analyzeDependencyTree))
+          .apply(MapElements.into(TypeDescriptors.strings()).via(mapDependencyTreesToJson))
+          .apply(TextIO.write().to("adjacency_list.txt"));
+      // [END NlpAnalyzeText]
+    }
+
+    /**
+     * Serializable holder for the sentiment values extracted from one annotation response: a
+     * document-level value plus a map from sentence text to that sentence's value.
+     */
+    private static class TextSentiments implements Serializable {
+      // Field order is kept as-is: Gson emits JSON members in declaration order.
+      private Float documentSentiment;
+      private Map<String, Float> sentenceSentiments;
+
+      /** Returns the document-level sentiment value, or {@code null} if it was never set. */
+      public Float getDocumentSentiment() {
+        return documentSentiment;
+      }
+
+      /** Stores the document-level sentiment value. */
+      public void setDocumentSentiment(Float documentSentiment) {
+        this.documentSentiment = documentSentiment;
+      }
+
+      /** Returns the sentence-text-to-sentiment map, or {@code null} if it was never set. */
+      public Map<String, Float> getSentenceSentiments() {
+        return sentenceSentiments;
+      }
+
+      /** Stores the sentence-text-to-sentiment map; the map is kept by reference. */
+      public void setSentenceSentiments(Map<String, Float> sentenceSentiments) {
+        this.sentenceSentiments = sentenceSentiments;
+      }
+
+      /** Serializes this object to a JSON string using a fresh Gson instance. */
+      public String toJson() {
+        return new Gson().toJson(this);
+      }
+    }
+ }
}
diff --git a/website/www/site/content/en/documentation/patterns/ai-platform.md b/website/www/site/content/en/documentation/patterns/ai-platform.md
index 905f895..b2a7b10 100644
--- a/website/www/site/content/en/documentation/patterns/ai-platform.md
+++ b/website/www/site/content/en/documentation/patterns/ai-platform.md
@@ -35,7 +35,7 @@ Here is an example of a pipeline that creates in-memory PCollection of strings,
{{< /highlight >}}
{{< highlight java >}}
-// Java examples will be available on Beam 2.23 release.
+{{< code_sample "examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" NlpAnalyzeText >}}
{{< /highlight >}}
@@ -79,7 +79,7 @@ The function for extracting information about sentence-level and document-level
{{< /highlight >}}
{{< highlight java >}}
-// Java examples will be available on Beam 2.23 release.
+{{< code_sample "examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" NlpExtractSentiments >}}
{{< /highlight >}}
The snippet loops over `sentences` and, for each sentence, extracts the sentiment score.
@@ -99,7 +99,7 @@ The next function inspects the response for entities and returns the names and t
{{< /highlight >}}
{{< highlight java >}}
-// Java examples will be available on Beam 2.23 release.
+{{< code_sample "examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" NlpExtractEntities >}}
{{< /highlight >}}
Entities can be found in the `entities` attribute. Just like before, `entities` is a sequence, which is why a list comprehension is a viable choice. The trickiest part is interpreting the types of entities. The Natural Language API defines entity types as an enum. In a response object, entity types are returned as integers. That's why a user has to instantiate `naturallanguageml.enums.Entity.Type` to access a human-readable name.
@@ -119,7 +119,7 @@ The following code loops over the sentences and, for each sentence, builds an ad
{{< /highlight >}}
{{< highlight java >}}
-// Java examples will be available on Beam 2.23 release.
+{{< code_sample "examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" NlpAnalyzeDependencyTree >}}
{{< /highlight >}}
The output is below. For better readability, indexes are replaced by text which they refer to: