You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by mw...@apache.org on 2020/05/05 09:10:55 UTC
[beam] 01/01: [BEAM-9634] Add natural language analysis transform
This is an automated email from the ASF dual-hosted git repository.
mwalenia pushed a commit to branch BEAM-9634-naturallanguage-java
in repository https://gitbox.apache.org/repos/asf/beam.git
commit 11cf385f4329780d3b2c07189cf6399f91ecb9e9
Author: Michal Walenia <mi...@polidea.com>
AuthorDate: Tue May 5 11:09:49 2020 +0200
[BEAM-9634] Add natural language analysis transform
---
sdks/java/extensions/ml/build.gradle | 4 +-
.../beam/sdk/extensions/ml/AnnotateText.java | 89 ++++++++++++++++++++++
.../beam/sdk/extensions/ml/AnnotateTextIT.java | 66 ++++++++++++++++
3 files changed, 158 insertions(+), 1 deletion(-)
diff --git a/sdks/java/extensions/ml/build.gradle b/sdks/java/extensions/ml/build.gradle
index 274c074..1f61c6a 100644
--- a/sdks/java/extensions/ml/build.gradle
+++ b/sdks/java/extensions/ml/build.gradle
@@ -26,10 +26,12 @@ description = 'Apache Beam :: SDKs :: Java :: Extensions :: ML'
dependencies {
compile project(path: ":sdks:java:core", configuration: "shadow")
compile project(":sdks:java:expansion-service")
- testCompile project(path: ':sdks:java:core', configuration: 'shadowTest')
compile 'com.google.cloud:google-cloud-video-intelligence:1.2.0'
+ compile 'com.google.cloud:google-cloud-language:1.99.4'
+ testCompile project(path: ':sdks:java:core', configuration: 'shadowTest')
testCompile library.java.mockito_core
testCompile 'com.google.cloud:google-cloud-video-intelligence:1.2.0'
+ testCompile 'com.google.cloud:google-cloud-language:1.99.4'
testCompile library.java.junit
testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow")
testRuntimeOnly project(":runners:google-cloud-dataflow-java")
diff --git a/sdks/java/extensions/ml/src/main/java/org/apache/beam/sdk/extensions/ml/AnnotateText.java b/sdks/java/extensions/ml/src/main/java/org/apache/beam/sdk/extensions/ml/AnnotateText.java
new file mode 100644
index 0000000..94488ea
--- /dev/null
+++ b/sdks/java/extensions/ml/src/main/java/org/apache/beam/sdk/extensions/ml/AnnotateText.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.ml;
+
+import com.google.auto.value.AutoValue;
+import com.google.cloud.language.v1.AnnotateTextRequest;
+import com.google.cloud.language.v1.AnnotateTextResponse;
+import com.google.cloud.language.v1.Document;
+import com.google.cloud.language.v1.LanguageServiceClient;
+import java.io.IOException;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.annotations.Experimental;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.values.PCollection;
+
+/**
+ * A {@link PTransform} using the Cloud AI Natural language processing capability. Takes an input
+ * {@link PCollection} of {@link Document}s and converts them to {@link AnnotateTextResponse}s.
+ *
+ * <p>It is possible to provide a language hint to the API. A {@link
+ * com.google.cloud.language.v1.AnnotateTextRequest.Features} object is required to configure
+ * analysis types to be done on the data.
+ */
+@Experimental
+@AutoValue
+public abstract class AnnotateText
+    extends PTransform<PCollection<Document>, PCollection<AnnotateTextResponse>> {
+
+  /**
+   * Returns the optional language code that is set on every {@link Document} before it is sent to
+   * the API, or {@code null} if documents are sent unchanged.
+   */
+  @Nullable
+  public abstract String languageHint();
+
+  /** Returns the analysis types that are requested for every document. */
+  public abstract AnnotateTextRequest.Features features();
+
+  /** Builder for {@link AnnotateText}. Features are required; the language hint is optional. */
+  @AutoValue.Builder
+  public abstract static class Builder {
+    /** Sets the language code to apply to every input {@link Document}. */
+    public abstract Builder setLanguageHint(String hint);
+
+    /** Sets the analysis types to request from the API. */
+    public abstract Builder setFeatures(AnnotateTextRequest.Features features);
+
+    /** Builds the configured transform. */
+    public abstract AnnotateText build();
+  }
+
+  /** Returns a new, empty {@link Builder}. */
+  public static Builder newBuilder() {
+    return new AutoValue_AnnotateText.Builder();
+  }
+
+  @Override
+  public PCollection<AnnotateTextResponse> expand(PCollection<Document> input) {
+    return input.apply(ParDo.of(new CallLanguageApi(languageHint(), features())));
+  }
+
+  /** Sends each {@link Document} to the Natural Language API and outputs the API response. */
+  private static class CallLanguageApi extends DoFn<Document, AnnotateTextResponse> {
+    @Nullable private final String languageHint;
+    private final AnnotateTextRequest.Features features;
+
+    // The client is not part of the serialized DoFn: it is created in setup() and released in
+    // teardown(), so it is shared by all elements processed by this DoFn instance.
+    private transient LanguageServiceClient client;
+
+    private CallLanguageApi(@Nullable String languageHint, AnnotateTextRequest.Features features) {
+      this.languageHint = languageHint;
+      this.features = features;
+    }
+
+    /**
+     * Creates the API client once per DoFn instance instead of once per element.
+     *
+     * @throws IOException if the client cannot be created
+     */
+    @Setup
+    public void setup() throws IOException {
+      client = LanguageServiceClient.create();
+    }
+
+    /** Annotates a single document, applying the language hint first when one is configured. */
+    @ProcessElement
+    public void processElement(ProcessContext context) {
+      Document element = context.element();
+      if (languageHint != null) {
+        // Documents are immutable, so the hint is applied to a copy of the input element.
+        element = element.toBuilder().setLanguage(languageHint).build();
+      }
+      context.output(client.annotateText(element, features));
+    }
+
+    /** Closes the API client created in {@link #setup()}, if any. */
+    @Teardown
+    public void teardown() {
+      if (client != null) {
+        client.close();
+        client = null;
+      }
+    }
+  }
+}
diff --git a/sdks/java/extensions/ml/src/test/java/org/apache/beam/sdk/extensions/ml/AnnotateTextIT.java b/sdks/java/extensions/ml/src/test/java/org/apache/beam/sdk/extensions/ml/AnnotateTextIT.java
new file mode 100644
index 0000000..863ae11
--- /dev/null
+++ b/sdks/java/extensions/ml/src/test/java/org/apache/beam/sdk/extensions/ml/AnnotateTextIT.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.ml;
+
+import static org.junit.Assert.assertEquals;
+
+import com.google.cloud.language.v1.AnnotateTextRequest;
+import com.google.cloud.language.v1.AnnotateTextResponse;
+import com.google.cloud.language.v1.Document;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.values.PCollection;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Integration test for {@link AnnotateText}: runs a one-element pipeline through the transform and
+ * checks the language reported in the API response.
+ */
+@RunWith(JUnit4.class)
+public class AnnotateTextIT {
+  @Rule public TestPipeline testPipeline = TestPipeline.create();
+  private static final String TEST_STRING = "Hello, world!";
+
+  /** Annotates {@link #TEST_STRING} and expects the response language to be English. */
+  @Test
+  public void analyzesLanguage() {
+    // The API rejects documents whose type is left unspecified, so it is set explicitly.
+    Document doc =
+        Document.newBuilder().setContent(TEST_STRING).setType(Document.Type.PLAIN_TEXT).build();
+    AnnotateTextRequest.Features features =
+        AnnotateTextRequest.Features.newBuilder().setExtractSyntax(true).build();
+    PCollection<AnnotateTextResponse> responses =
+        testPipeline
+            .apply(Create.of(doc))
+            .apply(AnnotateText.newBuilder().setFeatures(features).build());
+    PAssert.that(responses).satisfies(new VerifyTextAnnotationResult());
+    // Without running the pipeline the PAssert above is never evaluated.
+    testPipeline.run().waitUntilFinish();
+  }
+
+  /** Passes when at least one response reports the language "en" (case-insensitive). */
+  private static class VerifyTextAnnotationResult
+      implements SerializableFunction<Iterable<AnnotateTextResponse>, Void> {
+
+    @Override
+    public Void apply(Iterable<AnnotateTextResponse> input) {
+      List<Boolean> labelEvaluations = new ArrayList<>();
+      for (AnnotateTextResponse response : input) {
+        labelEvaluations.add(response.getLanguage().equalsIgnoreCase("en"));
+      }
+      assertEquals(
+          "Expected at least one response with language \"en\"",
+          Boolean.TRUE,
+          labelEvaluations.contains(Boolean.TRUE));
+      return null;
+    }
+  }
+}