You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by mw...@apache.org on 2020/05/05 09:10:55 UTC

[beam] 01/01: [BEAM-9634] Add natural language analysis transform

This is an automated email from the ASF dual-hosted git repository.

mwalenia pushed a commit to branch BEAM-9634-naturallanguage-java
in repository https://gitbox.apache.org/repos/asf/beam.git

commit 11cf385f4329780d3b2c07189cf6399f91ecb9e9
Author: Michal Walenia <mi...@polidea.com>
AuthorDate: Tue May 5 11:09:49 2020 +0200

    [BEAM-9634] Add natural language analysis transform
---
 sdks/java/extensions/ml/build.gradle               |  4 +-
 .../beam/sdk/extensions/ml/AnnotateText.java       | 89 ++++++++++++++++++++++
 .../beam/sdk/extensions/ml/AnnotateTextIT.java     | 66 ++++++++++++++++
 3 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/sdks/java/extensions/ml/build.gradle b/sdks/java/extensions/ml/build.gradle
index 274c074..1f61c6a 100644
--- a/sdks/java/extensions/ml/build.gradle
+++ b/sdks/java/extensions/ml/build.gradle
@@ -26,10 +26,12 @@ description = 'Apache Beam :: SDKs :: Java :: Extensions :: ML'
 dependencies {
     compile project(path: ":sdks:java:core", configuration: "shadow")
     compile project(":sdks:java:expansion-service")
-    testCompile project(path: ':sdks:java:core', configuration: 'shadowTest')
     compile 'com.google.cloud:google-cloud-video-intelligence:1.2.0'
+    compile 'com.google.cloud:google-cloud-language:1.99.4'
+    testCompile project(path: ':sdks:java:core', configuration: 'shadowTest')
     testCompile library.java.mockito_core
     testCompile 'com.google.cloud:google-cloud-video-intelligence:1.2.0'
+    testCompile 'com.google.cloud:google-cloud-language:1.99.4'
     testCompile library.java.junit
     testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow")
     testRuntimeOnly project(":runners:google-cloud-dataflow-java")
diff --git a/sdks/java/extensions/ml/src/main/java/org/apache/beam/sdk/extensions/ml/AnnotateText.java b/sdks/java/extensions/ml/src/main/java/org/apache/beam/sdk/extensions/ml/AnnotateText.java
new file mode 100644
index 0000000..94488ea
--- /dev/null
+++ b/sdks/java/extensions/ml/src/main/java/org/apache/beam/sdk/extensions/ml/AnnotateText.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.ml;
+
+import com.google.auto.value.AutoValue;
+import com.google.cloud.language.v1.AnnotateTextRequest;
+import com.google.cloud.language.v1.AnnotateTextResponse;
+import com.google.cloud.language.v1.Document;
+import com.google.cloud.language.v1.LanguageServiceClient;
+import java.io.IOException;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.annotations.Experimental;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.values.PCollection;
+
+/**
+ * A {@link PTransform} using the Cloud AI Natural language processing capability. Takes an input
+ * {@link PCollection} of {@link Document}s and converts them to {@link AnnotateTextResponse}s.
+ *
+ * <p>It is possible to provide a language hint to the API. A {@link
+ * com.google.cloud.language.v1.AnnotateTextRequest.Features} object is required to configure
+ * analysis types to be done on the data.
+ */
+@Experimental
+@AutoValue
+public abstract class AnnotateText
+    extends PTransform<PCollection<Document>, PCollection<AnnotateTextResponse>> {
+
+  /**
+   * Returns the optional language hint. When non-null it is written into every {@link Document}
+   * before the document is sent for annotation.
+   */
+  @Nullable
+  public abstract String languageHint();
+
+  /** Returns the set of analysis features requested for every document. */
+  public abstract AnnotateTextRequest.Features features();
+
+  /** Builder for {@link AnnotateText}. */
+  @AutoValue.Builder
+  public abstract static class Builder {
+    /** Sets the language that is applied to every input document. Optional. */
+    public abstract Builder setLanguageHint(String hint);
+
+    /** Sets the analysis features to request. Required. */
+    public abstract Builder setFeatures(AnnotateTextRequest.Features features);
+
+    /** Builds the configured {@link AnnotateText} transform. */
+    public abstract AnnotateText build();
+  }
+
+  /** Creates a new {@link Builder} with no properties set. */
+  public static Builder newBuilder() {
+    return new AutoValue_AnnotateText.Builder();
+  }
+
+  /** Applies a {@link ParDo} that sends each input {@link Document} for annotation. */
+  @Override
+  public PCollection<AnnotateTextResponse> expand(PCollection<Document> input) {
+    return input.apply(ParDo.of(new CallLanguageApi(languageHint(), features())));
+  }
+
+  /**
+   * {@link DoFn} that sends each {@link Document} to the language service and outputs the response.
+   *
+   * <p>The service client is created once per {@link DoFn} instance in {@link #setup()} and closed
+   * in {@link #teardown()}, instead of being created and closed again for every element.
+   */
+  private static class CallLanguageApi extends DoFn<Document, AnnotateTextResponse> {
+    @Nullable private final String languageHint;
+    private final AnnotateTextRequest.Features features;
+
+    // Transient: the client is not part of the serialized DoFn; it is (re)created in setup().
+    private transient LanguageServiceClient client;
+
+    private CallLanguageApi(
+        @Nullable String languageHint, AnnotateTextRequest.Features features) {
+      this.languageHint = languageHint;
+      this.features = features;
+    }
+
+    /**
+     * Creates the client that is reused for all elements processed by this instance.
+     *
+     * @throws IOException if the client cannot be created
+     */
+    @Setup
+    public void setup() throws IOException {
+      client = LanguageServiceClient.create();
+    }
+
+    /** Closes the client, if one was created, and drops the reference to it. */
+    @Teardown
+    public void teardown() {
+      if (client != null) {
+        client.close();
+        client = null;
+      }
+    }
+
+    /**
+     * Annotates the current element and outputs the response. If a language hint was configured,
+     * it replaces the language set on the element before the request is made.
+     */
+    @ProcessElement
+    public void processElement(ProcessContext context) {
+      Document element = context.element();
+      if (languageHint != null) {
+        element = element.toBuilder().setLanguage(languageHint).build();
+      }
+      context.output(client.annotateText(element, features));
+    }
+  }
+}
diff --git a/sdks/java/extensions/ml/src/test/java/org/apache/beam/sdk/extensions/ml/AnnotateTextIT.java b/sdks/java/extensions/ml/src/test/java/org/apache/beam/sdk/extensions/ml/AnnotateTextIT.java
new file mode 100644
index 0000000..863ae11
--- /dev/null
+++ b/sdks/java/extensions/ml/src/test/java/org/apache/beam/sdk/extensions/ml/AnnotateTextIT.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.ml;
+
+import static org.junit.Assert.assertEquals;
+
+import com.google.cloud.language.v1.AnnotateTextRequest;
+import com.google.cloud.language.v1.AnnotateTextResponse;
+import com.google.cloud.language.v1.Document;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.values.PCollection;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Integration test for {@link AnnotateText}; it calls the real language service. */
+@RunWith(JUnit4.class)
+public class AnnotateTextIT {
+  @Rule public TestPipeline testPipeline = TestPipeline.create();
+  private static final String TEST_STRING = "Hello, world!";
+
+  /** Annotates a short English sentence and checks that a response reports language "en". */
+  @Test
+  public void analyzesLanguage() {
+    // The document type is set explicitly; the API documents an unspecified type as an
+    // invalid argument.
+    Document doc =
+        Document.newBuilder().setContent(TEST_STRING).setType(Document.Type.PLAIN_TEXT).build();
+    AnnotateTextRequest.Features features =
+        AnnotateTextRequest.Features.newBuilder().setExtractSyntax(true).build();
+    PCollection<AnnotateTextResponse> responses =
+        testPipeline
+            .apply(Create.of(doc))
+            .apply(AnnotateText.newBuilder().setFeatures(features).build());
+    PAssert.that(responses).satisfies(new VerifyTextAnnotationResult());
+    // Building the pipeline and registering the PAssert executes nothing; the pipeline must be
+    // run for the transform and the assertion to take effect.
+    testPipeline.run().waitUntilFinish();
+  }
+
+  /** Asserts that at least one response reports the language "en" (case-insensitive). */
+  private static class VerifyTextAnnotationResult
+      implements SerializableFunction<Iterable<AnnotateTextResponse>, Void> {
+
+    @Override
+    public Void apply(Iterable<AnnotateTextResponse> input) {
+      List<Boolean> labelEvaluations = new ArrayList<>();
+      input.forEach(
+          response -> labelEvaluations.add(response.getLanguage().equalsIgnoreCase("en")));
+      assertEquals(Boolean.TRUE, labelEvaluations.contains(Boolean.TRUE));
+      return null;
+    }
+  }
+}