Posted to github@beam.apache.org by GitBox <gi...@apache.org> on 2020/12/30 18:00:36 UTC

[GitHub] [beam] TheNeuralBit commented on a change in pull request #13636: [BEAM-11411] [BEAM-11410] Kafka to pub sub E2E test

TheNeuralBit commented on a change in pull request #13636:
URL: https://github.com/apache/beam/pull/13636#discussion_r550277830



##########
File path: examples/java/src/test/java/org/apache/beam/examples/complete/kafkatopubsub/utils/RunKafkaContainer.java
##########
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.examples.complete.kafkatopubsub.utils;
+
+import java.util.UUID;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerConfig;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import org.apache.kafka.common.serialization.StringSerializer;
+import org.testcontainers.containers.KafkaContainer;
+import org.testcontainers.utility.DockerImageName;
+
+/** Runs a Kafka container and schedules a producer thread to send a test message. */
+public class RunKafkaContainer {
+
+  private static final String KAFKA_IMAGE_NAME = "confluentinc/cp-kafka:5.4.3";
+  private final String topicName;
+  private final KafkaProducer<String, String> producer;
+  private final String bootstrapServer;
+
+  public RunKafkaContainer(String pubsubMessage) {
+    bootstrapServer = setupKafkaContainer();
+    topicName = "messages-topic";
+    producer =
+        new KafkaProducer<>(
+            ImmutableMap.of(
+                ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,
+                bootstrapServer,
+                ProducerConfig.CLIENT_ID_CONFIG,
+                UUID.randomUUID().toString()),
+            new StringSerializer(),
+            new StringSerializer());
+    Runnable kafkaProducer =
+        () -> {
+          try {
+            producer.send(new ProducerRecord<>(topicName, "testcontainers", pubsubMessage)).get();
+            System.out.println("Producer sent");
+          } catch (ExecutionException | InterruptedException e) {
+            throw new RuntimeException("Something went wrong in kafka producer", e);
+          }
+        };
+    // The Checker Framework check fails unless the `.schedule(...)` result is assigned to a variable.
+    @SuppressWarnings("unused")
+    ScheduledFuture<?> schedule =
+        Executors.newSingleThreadScheduledExecutor().schedule(kafkaProducer, 10, TimeUnit.SECONDS);

Review comment:
       Could you instead inject the data in the test after the pipeline has started?
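       For example, something shaped like this could go in the test body (a sketch only, reusing this PR's `pipeline`, `bootstrapServer`, `rkc`, and `signal` names; the blocking `.get()` would need the test to declare `throws Exception`):

```java
// Start the pipeline first...
PipelineResult job = pipeline.run();

// ...then inject the record from the test thread, instead of a
// scheduled executor hidden inside RunKafkaContainer.
try (KafkaProducer<String, String> producer =
    new KafkaProducer<>(
        ImmutableMap.of(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServer),
        new StringSerializer(),
        new StringSerializer())) {
  // get() blocks until the broker acknowledges the record.
  producer.send(new ProducerRecord<>(rkc.getTopicName(), "testcontainers", PUBSUB_MESSAGE)).get();
}

signal.waitForSuccess(Duration.standardMinutes(2));
job.cancel();
```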

##########
File path: examples/java/src/test/java/org/apache/beam/examples/complete/kafkatopubsub/KafkaToPubsubE2ETest.java
##########
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.examples.complete.kafkatopubsub;
+
+import static org.apache.beam.examples.complete.kafkatopubsub.transforms.FormatTransform.readFromKafka;
+
+import com.google.auth.Credentials;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.function.Supplier;
+import org.apache.beam.examples.complete.kafkatopubsub.utils.RunKafkaContainer;
+import org.apache.beam.runners.direct.DirectOptions;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.extensions.gcp.auth.NoopCredentialFactory;
+import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
+import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient;
+import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
+import org.apache.beam.sdk.io.gcp.pubsub.PubsubJsonClient;
+import org.apache.beam.sdk.io.gcp.pubsub.PubsubOptions;
+import org.apache.beam.sdk.io.gcp.pubsub.TestPubsubSignal;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Values;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.joda.time.Duration;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.testcontainers.containers.PubSubEmulatorContainer;
+import org.testcontainers.utility.DockerImageName;
+
+/** E2E test for the {@link KafkaToPubsub} pipeline. */
+public class KafkaToPubsubE2ETest {
+
+  @Rule public final transient TestPipeline pipeline = TestPipeline.fromOptions(OPTIONS);
+  @Rule public transient TestPubsubSignal signal = TestPubsubSignal.fromOptions(OPTIONS);
+
+  private static final String PUBSUB_EMULATOR_IMAGE =
+      "gcr.io/google.com/cloudsdktool/cloud-sdk:316.0.0-emulators";
+  private static final String PUBSUB_MESSAGE = "test pubsub message";
+  private static final String PROJECT_ID = "try-kafka-pubsub";
+  private static final String TOPIC_NAME = "listen-to-kafka";
+  private static final PubsubClient.TopicPath TOPIC_PATH =
+      PubsubClient.topicPathFromName(PROJECT_ID, TOPIC_NAME);
+  private static final PipelineOptions OPTIONS = TestPipeline.testingPipelineOptions();
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    Credentials credentials = NoopCredentialFactory.fromOptions(OPTIONS).getCredential();
+    OPTIONS.as(GcpOptions.class).setGcpCredential(credentials);
+    OPTIONS.as(GcpOptions.class).setProject(PROJECT_ID);
+    setupPubsubContainer(OPTIONS.as(PubsubOptions.class));
+    createPubsubTopicForTest(OPTIONS.as(PubsubOptions.class));
+  }
+
+  @Test
+  public void testKafkaToPubsubE2E() throws IOException {
+    pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);
+
+    RunKafkaContainer rkc = new RunKafkaContainer(PUBSUB_MESSAGE);
+    String bootstrapServer = rkc.getBootstrapServer();
+    String[] kafkaTopicsList = new String[] {rkc.getTopicName()};
+
+    String pubsubTopicPath = TOPIC_PATH.getPath();
+
+    Map<String, Object> kafkaConfig = new HashMap<>();
+    Map<String, String> sslConfig = new HashMap<>();
+
+    PCollection<KV<String, String>> readStrings =
+        pipeline.apply(
+            "readFromKafka",
+            readFromKafka(bootstrapServer, Arrays.asList(kafkaTopicsList), kafkaConfig, sslConfig));
+
+    PCollection<String> readFromPubsub =
+        readStrings
+            .apply(Values.create())
+            .apply("writeToPubSub", PubsubIO.writeStrings().to(pubsubTopicPath))
+            .getPipeline()
+            .apply("readFromPubsub", PubsubIO.readStrings().fromTopic(pubsubTopicPath));
+
+    readFromPubsub.apply(
+        "waitForTestMessage",
+        signal.signalSuccessWhen(
+            readFromPubsub.getCoder(),
+            input -> {
+              if (input == null) {
+                return false;
+              }
+              return input.stream().anyMatch(message -> Objects.equals(message, PUBSUB_MESSAGE));
+            }));
+
+    Supplier<Void> start = signal.waitForStart(Duration.standardSeconds(10));
+    pipeline.apply(signal.signalStart());
+    PipelineResult job = pipeline.run();
+    start.get();
+    signal.waitForSuccess(Duration.standardMinutes(2));
+    try {
+      job.cancel();
+    } catch (IOException | UnsupportedOperationException e) {
+      throw new AssertionError("Could not stop pipeline.", e);
+    }
+  }
+
+  private static void setupPubsubContainer(PubsubOptions options) {
+    PubSubEmulatorContainer emulator =
+        new PubSubEmulatorContainer(DockerImageName.parse(PUBSUB_EMULATOR_IMAGE));
+    emulator.start();
+    String pubsubUrl = emulator.getEmulatorEndpoint();
+    options.setPubsubRootUrl("http://" + pubsubUrl);
+  }
+
+  private static void createPubsubTopicForTest(PubsubOptions options) {
+    try {
+      PubsubClient pubsubClient = PubsubJsonClient.FACTORY.newClient(null, null, options);
+      pubsubClient.createTopic(TOPIC_PATH);
+    } catch (Exception e) {
+      throw new RuntimeException(e);

Review comment:
       You might look at using `TestPubsub` to create the test topic instead of creating it manually. `TestPubsub` also has a method that you can use to check that the topic receives the expected messages, which would save you from creating the `readFromPubsub` transform to signal success from within the pipeline: https://github.com/apache/beam/blob/5e17b69630cc61b42523582b6bb94cb8191dcef5/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/TestPubsub.java#L342
   
   It will be tricky to make this work with the pubsub test container, though, since we'll need to start the test container _before_ the TestPubsub Rule initializes its topic. Still, this would be really useful infrastructure, as it would allow us to run many other pubsub tests against the fake instead of prod pubsub.
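   Roughly (a sketch only; it assumes the emulator container has been started, and `pubsubRootUrl` set on OPTIONS, before JUnit initializes the rule, which is exactly the tricky ordering part):

```java
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasProperty;

import org.apache.beam.sdk.io.gcp.pubsub.TestPubsub;

// Rule that creates (and later deletes) a dedicated test topic. It has to
// see the emulator's pubsubRootUrl already set on OPTIONS when it initializes.
@Rule public transient TestPubsub testPubsub = TestPubsub.fromOptions(OPTIONS);

// Then, in the test, after the pipeline (writing to testPubsub.topicPath())
// has started:
testPubsub
    .assertThatTopicEventuallyReceives(
        // hamcrest bean matcher against PubsubMessage.getPayload()
        hasProperty("payload", equalTo(PUBSUB_MESSAGE.getBytes(UTF_8))))
    .waitForUpTo(Duration.standardMinutes(2));
```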

##########
File path: examples/java/src/test/java/org/apache/beam/examples/complete/kafkatopubsub/KafkaToPubsubE2ETest.java
##########
+    PCollection<KV<String, String>> readStrings =
+        pipeline.apply(
+            "readFromKafka",
+            readFromKafka(bootstrapServer, Arrays.asList(kafkaTopicsList), kafkaConfig, sslConfig));
+
+    PCollection<String> readFromPubsub =
+        readStrings
+            .apply(Values.create())
+            .apply("writeToPubSub", PubsubIO.writeStrings().to(pubsubTopicPath))
+            .getPipeline()
+            .apply("readFromPubsub", PubsubIO.readStrings().fromTopic(pubsubTopicPath));

Review comment:
       It would be preferable to run the exact KafkaToPubsub pipeline, then use utilities outside of the pipeline to inject data into the Kafka topic, and then verify that the Pub/Sub topic receives the expected messages. As noted in my other comment, `TestPubsub` can help with the latter.
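   Something shaped like this (a sketch; `KafkaToPubsub.run(...)`, the option flag names, and the `newTestProducer` helper are assumptions for illustration, not this PR's actual API):

```java
// Run the real pipeline, unmodified; all test plumbing stays outside of it.
KafkaToPubsubOptions options =
    PipelineOptionsFactory.fromArgs(
            "--bootstrapServers=" + bootstrapServer,             // assumed flag name
            "--inputTopics=" + kafkaTopic,                       // assumed flag name
            "--outputTopic=" + testPubsub.topicPath().getPath()) // assumed flag name
        .as(KafkaToPubsubOptions.class);
PipelineResult job = KafkaToPubsub.run(options);                 // assumed entry point

// Inject input into Kafka from the test (hypothetical helper that builds a
// producer against the test container).
try (KafkaProducer<String, String> producer = newTestProducer(bootstrapServer)) {
  producer.send(new ProducerRecord<>(kafkaTopic, PUBSUB_MESSAGE)).get();
}

// Verify output on the Pub/Sub side, also from the test (see my other comment).
testPubsub
    .assertThatTopicEventuallyReceives(
        hasProperty("payload", equalTo(PUBSUB_MESSAGE.getBytes(UTF_8))))
    .waitForUpTo(Duration.standardMinutes(2));
job.cancel();
```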




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org