Posted to commits@beam.apache.org by al...@apache.org on 2020/08/06 17:29:47 UTC

[beam] branch master updated: [BEAM-9421] Add Java snippets to NLP documentation.

This is an automated email from the ASF dual-hosted git repository.

altay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new a76cf13  [BEAM-9421] Add Java snippets to NLP documentation.
     new fdb3986  Merge pull request #12453 from mwalenia/BEAM-9421-nlp-transform-java-website-doc
a76cf13 is described below

commit a76cf139b6bd5242be31d22f85f23cafce100612
Author: Michal Walenia <mi...@polidea.com>
AuthorDate: Mon Aug 3 16:16:49 2020 +0200

    [BEAM-9421] Add Java snippets to NLP documentation.
---
 examples/java/build.gradle                         |   3 +
 .../apache/beam/examples/snippets/Snippets.java    | 174 +++++++++++++++++++++
 .../en/documentation/patterns/ai-platform.md       |   8 +-
 3 files changed, 181 insertions(+), 4 deletions(-)

diff --git a/examples/java/build.gradle b/examples/java/build.gradle
index 0c91573..eaad35f 100644
--- a/examples/java/build.gradle
+++ b/examples/java/build.gradle
@@ -52,6 +52,7 @@ dependencies {
   compile project(path: ":sdks:java:core", configuration: "shadow")
   compile project(":sdks:java:extensions:google-cloud-platform-core")
   compile project(":sdks:java:io:google-cloud-platform")
+  compile project(":sdks:java:extensions:ml")
   compile library.java.avro
   compile library.java.bigdataoss_util
   compile library.java.google_api_client
@@ -60,6 +61,7 @@ dependencies {
   compile library.java.google_auth_library_credentials
   compile library.java.google_auth_library_oauth2_http
   compile library.java.google_cloud_datastore_v1_proto_client
+  compile library.java.google_code_gson
   compile library.java.google_http_client
   compile library.java.joda_time
   compile library.java.proto_google_cloud_datastore_v1
@@ -67,6 +69,7 @@ dependencies {
   compile library.java.slf4j_jdk14
   runtime project(path: ":runners:direct-java", configuration: "shadow")
   testCompile project(":sdks:java:io:google-cloud-platform")
+  testCompile project(":sdks:java:extensions:ml")
   testCompile library.java.hamcrest_core
   testCompile library.java.hamcrest_library
   testCompile library.java.junit
diff --git a/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java b/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java
index 3393e6d..cb55475f 100644
--- a/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java
+++ b/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java
@@ -22,6 +22,14 @@ import com.google.api.services.bigquery.model.TableReference;
 import com.google.api.services.bigquery.model.TableRow;
 import com.google.api.services.bigquery.model.TableSchema;
 import com.google.api.services.bigquery.model.TimePartitioning;
+import com.google.cloud.language.v1.AnnotateTextRequest;
+import com.google.cloud.language.v1.AnnotateTextResponse;
+import com.google.cloud.language.v1.Document;
+import com.google.cloud.language.v1.Entity;
+import com.google.cloud.language.v1.Sentence;
+import com.google.cloud.language.v1.Token;
+import com.google.gson.Gson;
+import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Base64;
@@ -30,12 +38,14 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.coders.AvroCoder;
 import org.apache.beam.sdk.coders.Coder;
 import org.apache.beam.sdk.coders.DefaultCoder;
 import org.apache.beam.sdk.coders.DoubleCoder;
+import org.apache.beam.sdk.extensions.ml.AnnotateText;
 import org.apache.beam.sdk.io.Compression;
 import org.apache.beam.sdk.io.FileIO;
 import org.apache.beam.sdk.io.GenerateSequence;
@@ -63,6 +73,7 @@ import org.apache.beam.sdk.transforms.DoFn;
 import org.apache.beam.sdk.transforms.MapElements;
 import org.apache.beam.sdk.transforms.ParDo;
 import org.apache.beam.sdk.transforms.PeriodicImpulse;
+import org.apache.beam.sdk.transforms.SerializableFunction;
 import org.apache.beam.sdk.transforms.Sum;
 import org.apache.beam.sdk.transforms.View;
 import org.apache.beam.sdk.transforms.Watch;
@@ -984,4 +995,167 @@ public class Snippets {
       return result;
     }
   }
+
+  public static class NaturalLanguageIntegration {
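+    // Converts an AnnotateTextResponse into one adjacency map per sentence, mapping each
+    // head token's text to the texts of the tokens that depend on it.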
+    private static final SerializableFunction<AnnotateTextResponse, List<Map<String, List<String>>>>
+        // [START NlpAnalyzeDependencyTree]
+        analyzeDependencyTree =
+            (SerializableFunction<AnnotateTextResponse, List<Map<String, List<String>>>>)
+                response -> {
+                  List<Map<String, List<String>>> adjacencyLists = new ArrayList<>();
+                  int index = 0;
+                  for (Sentence s : response.getSentencesList()) {
+                    Map<String, List<String>> adjacencyMap = new HashMap<>();
+                    int sentenceBegin = s.getText().getBeginOffset();
+                    int sentenceEnd = sentenceBegin + s.getText().getContent().length() - 1;
+                    while (index < response.getTokensCount()
+                        && response.getTokens(index).getText().getBeginOffset() <= sentenceEnd) {
+                      Token token = response.getTokensList().get(index);
+                      int headTokenIndex = token.getDependencyEdge().getHeadTokenIndex();
+                      String headTokenContent =
+                          response.getTokens(headTokenIndex).getText().getContent();
+                      List<String> adjacencyList =
+                          adjacencyMap.getOrDefault(headTokenContent, new ArrayList<>());
+                      adjacencyList.add(token.getText().getContent());
+                      adjacencyMap.put(headTokenContent, adjacencyList);
+                      index++;
+                    }
+                    adjacencyLists.add(adjacencyMap);
+                  }
+                  return adjacencyLists;
+                };
+    // [END NlpAnalyzeDependencyTree]
+
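+    // Extracts the document-level sentiment magnitude and a map from each sentence's text
+    // to its sentence-level sentiment magnitude.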
+    private static final SerializableFunction<? super AnnotateTextResponse, TextSentiments>
+        // [START NlpExtractSentiments]
+        extractSentiments =
+        (SerializableFunction<AnnotateTextResponse, TextSentiments>)
+            annotateTextResponse -> {
+              TextSentiments sentiments = new TextSentiments();
+              sentiments.setDocumentSentiment(
+                  annotateTextResponse.getDocumentSentiment().getMagnitude());
+              Map<String, Float> sentenceSentimentsMap =
+                  annotateTextResponse.getSentencesList().stream()
+                      .collect(
+                          Collectors.toMap(
+                              (Sentence s) -> s.getText().getContent(),
+                              (Sentence s) -> s.getSentiment().getMagnitude()));
+              sentiments.setSentenceSentiments(sentenceSentimentsMap);
+              return sentiments;
+            };
+    // [END NlpExtractSentiments]
+
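+    // Maps each recognized entity's name to the string form of its entity type.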
+    private static final SerializableFunction<? super AnnotateTextResponse, Map<String, String>>
+        // [START NlpExtractEntities]
+        extractEntities =
+        (SerializableFunction<AnnotateTextResponse, Map<String, String>>)
+            annotateTextResponse ->
+                annotateTextResponse.getEntitiesList().stream()
+                    .collect(
+                        Collectors.toMap(Entity::getName, (Entity e) -> e.getType().toString()));
+    // [END NlpExtractEntities]
+
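+    // Renders the entity map as a JSON array of single-entry objects, e.g. {"name": "TYPE"}.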
+    private static final SerializableFunction<? super Map<String, String>, String>
+        mapEntitiesToJson =
+            (SerializableFunction<Map<String, String>, String>)
+                item -> {
+                  StringBuilder builder = new StringBuilder("[");
+                  builder.append(
+                      item.entrySet().stream()
+                          .map(
+                              entry -> "{\"" + entry.getKey() + "\": \"" + entry.getValue() + "\"}")
+                          .collect(Collectors.joining(",")));
+                  builder.append("]");
+                  return builder.toString();
+                };
+
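+    // Serializes the per-sentence adjacency maps to JSON with Gson.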
+    private static final SerializableFunction<List<Map<String, List<String>>>, String>
+        mapDependencyTreesToJson =
+            (SerializableFunction<List<Map<String, List<String>>>, String>)
+                tree -> {
+                  Gson gson = new Gson();
+                  return gson.toJson(tree);
+                };
+
+    public static void main(Pipeline p) {
+      // [START NlpAnalyzeText]
+      AnnotateTextRequest.Features features =
+          AnnotateTextRequest.Features.newBuilder()
+              .setExtractEntities(true)
+              .setExtractDocumentSentiment(true)
+              .setExtractEntitySentiment(true)
+              .setExtractSyntax(true)
+              .build();
+      AnnotateText annotateText = AnnotateText.newBuilder().setFeatures(features).build();
+
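+      // Wrap each input string in a plain-text Document and annotate it with the
+      // configured features.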
+      PCollection<AnnotateTextResponse> responses =
+          p.apply(
+                  Create.of(
+                      "My experience so far has been fantastic, "
+                          + "I\'d really recommend this product."))
+              .apply(
+                  MapElements.into(TypeDescriptor.of(Document.class))
+                      .via(
+                          (SerializableFunction<String, Document>)
+                              input ->
+                                  Document.newBuilder()
+                                      .setContent(input)
+                                      .setType(Document.Type.PLAIN_TEXT)
+                                      .build()))
+              .apply(annotateText);
+
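+      // Write the document- and sentence-level sentiments as JSON.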
+      responses
+          .apply(MapElements.into(TypeDescriptor.of(TextSentiments.class)).via(extractSentiments))
+          .apply(
+              MapElements.into(TypeDescriptors.strings())
+                  .via((SerializableFunction<TextSentiments, String>) TextSentiments::toJson))
+          .apply(TextIO.write().to("sentiments.txt"));
+
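+      // Write the extracted entities as JSON.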
+      responses
+          .apply(
+              MapElements.into(
+                      TypeDescriptors.maps(TypeDescriptors.strings(), TypeDescriptors.strings()))
+                  .via(extractEntities))
+          .apply(MapElements.into(TypeDescriptors.strings()).via(mapEntitiesToJson))
+          .apply(TextIO.write().to("entities.txt"));
+
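+      // Write the per-sentence dependency adjacency lists as JSON.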
+      responses
+          .apply(
+              MapElements.into(
+                      TypeDescriptors.lists(
+                          TypeDescriptors.maps(
+                              TypeDescriptors.strings(),
+                              TypeDescriptors.lists(TypeDescriptors.strings()))))
+                  .via(analyzeDependencyTree))
+          .apply(MapElements.into(TypeDescriptors.strings()).via(mapDependencyTreesToJson))
+          .apply(TextIO.write().to("adjacency_list.txt"));
+      // [END NlpAnalyzeText]
+    }
+
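+    // Serializable holder for the document-level sentiment magnitude and the
+    // per-sentence sentiment magnitudes.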
+    private static class TextSentiments implements Serializable {
+      private Float documentSentiment;
+      private Map<String, Float> sentenceSentiments;
+
+      public void setSentenceSentiments(Map<String, Float> sentenceSentiments) {
+        this.sentenceSentiments = sentenceSentiments;
+      }
+
+      public Float getDocumentSentiment() {
+        return documentSentiment;
+      }
+
+      public void setDocumentSentiment(Float documentSentiment) {
+        this.documentSentiment = documentSentiment;
+      }
+
+      public Map<String, Float> getSentenceSentiments() {
+        return sentenceSentiments;
+      }
+
+      public String toJson() {
+        Gson gson = new Gson();
+        return gson.toJson(this);
+      }
+    }
+  }
 }
diff --git a/website/www/site/content/en/documentation/patterns/ai-platform.md b/website/www/site/content/en/documentation/patterns/ai-platform.md
index 905f895..b2a7b10 100644
--- a/website/www/site/content/en/documentation/patterns/ai-platform.md
+++ b/website/www/site/content/en/documentation/patterns/ai-platform.md
@@ -35,7 +35,7 @@ Here is an example of a pipeline that creates an in-memory PCollection of strings,
 {{< /highlight >}}
 
 {{< highlight java >}}
-// Java examples will be available on Beam 2.23 release.
+{{< code_sample "examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" NlpAnalyzeText >}}
 {{< /highlight >}}
 
 
@@ -79,7 +79,7 @@ The function for extracting information about sentence-level and document-level
 {{< /highlight >}}
 
 {{< highlight java >}}
-// Java examples will be available on Beam 2.23 release.
+{{< code_sample "examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" NlpExtractSentiments >}}
 {{< /highlight >}}
 
 The snippet loops over `sentences` and, for each sentence, extracts the sentiment score. 
@@ -99,7 +99,7 @@ The next function inspects the response for entities and returns the names and t
 {{< /highlight >}}
 
 {{< highlight java >}}
-// Java examples will be available on Beam 2.23 release.
+{{< code_sample "examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" NlpExtractEntities >}}
 {{< /highlight >}}
 
Entities can be found in the `entities` attribute. Just like before, `entities` is a sequence, which is why a list comprehension is a viable choice. The trickiest part is interpreting the types of entities. The Natural Language API defines entity types as an enum. In a response object, entity types are returned as integers. That's why a user has to instantiate `naturallanguageml.enums.Entity.Type` to access a human-readable name.
@@ -119,7 +119,7 @@ The following code loops over the sentences and, for each sentence, builds an ad
 {{< /highlight >}}
 
 {{< highlight java >}}
-// Java examples will be available on Beam 2.23 release.
+{{< code_sample "examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" NlpAnalyzeDependencyTree >}}
 {{< /highlight >}}
 
The output is below. For better readability, indexes are replaced by the text they refer to: